From 1875eb6c6adf938a4bcf55af38e429d7096b0211 Mon Sep 17 00:00:00 2001 From: Luke Cheng <2258420+chenglu@users.noreply.github.com> Date: Sun, 27 Nov 2022 14:40:54 +0000 Subject: [PATCH] docs: fix: Accurate for the origin (English) subtitles --- .../00_welcome-to-the-hugging-face-course.srt | 539 +++++-- subtitles/en/01_the-pipeline-function.srt | 668 ++++++--- ...2_the-carbon-footprint-of-transformers.srt | 582 +++++++- subtitles/en/03_what-is-transfer-learning.srt | 599 +++++--- .../en/04_the-transformer-architecture.srt | 418 ++++-- .../en/05_transformer-models-encoders.srt | 678 ++++++--- .../en/06_transformer-models-decoders.srt | 658 +++++---- ...07_transformer-models-encoder-decoders.srt | 944 ++++++++---- ...inside-the-pipeline-function-(pytorch).srt | 715 ++++++--- ...ide-the-pipeline-function-(tensorflow).srt | 711 ++++++--- ...antiate-a-transformers-model-(pytorch).srt | 465 ++++-- ...iate-a-transformers-model-(tensorflow).srt | 512 ++++--- subtitles/en/12_tokenizers-overview.srt | 137 +- subtitles/en/13_word-based-tokenizers.srt | 392 +++-- .../en/14_character-based-tokenizers.srt | 412 ++++-- subtitles/en/15_subword-based-tokenizers.srt | 482 ++++-- subtitles/en/16_the-tokenization-pipeline.srt | 507 ++++--- .../17_batching-inputs-together-(pytorch).srt | 435 ++++-- ..._batching-inputs-together-(tensorflow).srt | 419 ++++-- ...gging-face-datasets-overview-(pytorch).srt | 505 ++++--- ...ng-face-datasets-overview-(tensorflow).srt | 484 +++--- ...preprocessing-sentence-pairs-(pytorch).srt | 443 ++++-- ...processing-sentence-pairs-(tensorflow).srt | 462 ++++-- subtitles/en/23_what-is-dynamic-padding.srt | 488 +++--- subtitles/en/24_the-trainer-api.srt | 556 ++++--- subtitles/en/25_keras-introduction.srt | 419 ++++-- .../en/26_fine-tuning-with-tensorflow.srt | 920 +++++++----- ...arning-rate-scheduling-with-tensorflow.srt | 647 +++++--- .../28_tensorflow-predictions-and-metrics.srt | 655 +++++--- ...29_write-your-training-loop-in-pytorch.srt | 868 ++++++----- ...-pytorch-training-loop-with-accelerate.srt | 495 ++++--- subtitles/en/31_navigating-the-model-hub.srt | 526 ++++--- .../32_managing-a-repo-on-the-model-hub.srt | 1096 +++++++++----- .../en/33_the-push-to-hub-api-(pytorch).srt | 723 ++++++--- .../34_the-push-to-hub-api-(tensorflow).srt | 1311 +++++++++++------ subtitles/en/35_loading-a-custom-dataset.srt | 472 ++++-- ...e-and-dice-a-dataset-\360\237\224\252.srt" | 573 ++++--- ...dataframes-=-\342\235\244\357\270\217.srt" | 427 ++++-- .../en/38_saving-and-reloading-a-dataset.srt | 538 ++++--- .../en/39_memory-mapping-&-streaming.srt | 553 ++++--- .../en/40_uploading-a-dataset-to-the-hub.srt | 337 +++-- .../41_text-embeddings-&-semantic-search.srt | 552 ++++--- subtitles/en/42_training-a-new-tokenizer.srt | 834 +++++++---- ...43_why-are-fast-tokenizers-called-fast.srt | 257 ++-- .../en/44_fast-tokenizer-superpowers.srt | 535 ++++--- ...oken-classification-pipeline-(pytorch).srt | 487 ++++-- ...n-classification-pipeline-(tensorflow).srt | 509 ++++--- ...-question-answering-pipeline-(pytorch).srt | 567 ++++--- ...estion-answering-pipeline-(tensorflow).srt | 535 ++++--- subtitles/en/49_what-is-normalization.srt | 664 +++++---- subtitles/en/50_what-is-pre-tokenization.srt | 308 ++-- .../en/51_byte-pair-encoding-tokenization.srt | 581 +++++--- subtitles/en/52_wordpiece-tokenization.srt | 444 ++++-- subtitles/en/53_unigram-tokenization.srt | 1151 +++++++++------ subtitles/en/54_building-a-new-tokenizer.srt | 641 +++++--- ...ta-processing-for-token-classification.srt | 500 ++++--- 
...rocessing-for-masked-language-modeling.srt | 373 +++-- subtitles/en/57_what-is-perplexity.srt | 330 +++-- subtitles/en/58_what-is-domain-adaptation.srt | 274 ++-- .../en/59_data-processing-for-translation.srt | 405 +++-- subtitles/en/60_what-is-the-bleu-metric.srt | 814 ++++++---- .../61_data-processing-for-summarization.srt | 325 ++-- subtitles/en/62_what-is-the-rouge-metric.srt | 689 ++++++--- ...rocessing-for-causal-language-modeling.srt | 629 +++++--- .../en/64_using-a-custom-loss-function.srt | 494 ++++--- ...data-processing-for-question-answering.srt | 462 +++--- ...g-step-in-question-answering-(pytorch).srt | 511 ++++--- ...tep-in-question-answering-(tensorflow).srt | 498 ++++--- subtitles/en/68_data-collators-a-tour.srt | 996 ++++++++----- .../69_what-to-do-when-you-get-an-error.srt | 405 +++-- .../en/70_using-a-debugger-in-a-notebook.srt | 451 ++++-- .../en/71_using-a-debugger-in-a-terminal.srt | 503 +++++-- .../en/72_asking-for-help-on-the-forums.srt | 520 ++++--- ...ugging-the-training-pipeline-(pytorch).srt | 662 ++++++--- ...ing-the-training-pipeline-(tensorflow).srt | 1108 ++++++++++---- subtitles/en/75_writing-a-good-issue.srt | 494 ++++--- 76 files changed, 28836 insertions(+), 14443 deletions(-) diff --git a/subtitles/en/00_welcome-to-the-hugging-face-course.srt b/subtitles/en/00_welcome-to-the-hugging-face-course.srt index f84bf031a..ae8eb7042 100644 --- a/subtitles/en/00_welcome-to-the-hugging-face-course.srt +++ b/subtitles/en/00_welcome-to-the-hugging-face-course.srt @@ -1,84 +1,455 @@ -1 -00:00:05,760 --> 00:00:10,480 -Welcome to the Hugging Face Course! This  -course has been designed to teach you all   - -2 -00:00:10,480 --> 00:00:15,200 -about the Hugging Face ecosystem: how to  -use the dataset and model hub as well as   - -3 -00:00:15,200 --> 00:00:20,880 -all our open source libraries. Here is  -the Table of Contents. As you can see,   - -4 -00:00:20,880 --> 00:00:26,000 -it's divided in three sections which become  -progressively more advanced. At this stage,   - -5 -00:00:26,000 --> 00:00:30,560 -the first two sections have been released. The  -first will teach you the basics of how to use   - -6 -00:00:30,560 --> 00:00:35,680 -a Transformer model, fine-tune it on your own  -dataset and share the result with the community.   - -7 -00:00:36,800 --> 00:00:42,400 -The second will dive deeper into our libraries  -and teach you how to tackle any NLP task. We are   - -8 -00:00:42,400 --> 00:00:46,960 -actively working on the last one and hope to  -have it ready for you for the spring of 2022.   - -9 -00:00:48,320 --> 00:00:52,320 -The first chapter requires no technical knowledge  -and is a good introduction to learn what   - -10 -00:00:52,320 --> 00:00:59,040 -Transformers models can do and how they could be  -of use to you or your company. The next chapters   - -11 -00:00:59,040 --> 00:01:03,280 -require a good knowledge of Python and some basic  -knowledge of Machine Learning and Deep Learning.   - -12 -00:01:04,160 --> 00:01:09,120 -If you don't know what a training and validation  -set is or what gradient descent means,   - -13 -00:01:09,120 --> 00:01:13,600 -you should look at an introductory course such as  -the ones published by deeplearning.ai or fast.ai.   - -14 -00:01:16,000 --> 00:01:20,400 -It's also best if you have some basics in one  -Deep Learning Framework (PyTorch or TensorFlow).   
- -15 -00:01:20,960 --> 00:01:25,280 -Each part of the material introduced in this  -course has a version in both those frameworks,   - -16 -00:01:25,280 --> 00:01:30,160 -so you will be able to pick the one you are  -most comfortable with. This is the team that   - -17 -00:01:30,160 --> 00:01:34,240 -developed this course. I'll now let each of  -the speakers introduce themselves briefly. +1 +00:00:05,850 --> 00:00:07,713 +- Welcome to the Hugging Face Course. + +2 +00:00:08,550 --> 00:00:10,320 +This course has been designed to teach you + +3 +00:00:10,320 --> 00:00:12,750 +all about the Hugging Face ecosystem, + +4 +00:00:12,750 --> 00:00:14,700 +how to use the dataset and model hub + +5 +00:00:14,700 --> 00:00:16,803 +as well as all our open-source libraries. + +6 +00:00:18,300 --> 00:00:19,950 +Here is the Table of Contents. + +7 +00:00:19,950 --> 00:00:22,770 +As you can see, it's +divided in three sections + +8 +00:00:22,770 --> 00:00:25,110 +which become progressively more advanced. + +9 +00:00:25,110 --> 00:00:28,500 +At this stage, the first two +sections have been released. + +10 +00:00:28,500 --> 00:00:30,120 +So first, we'll teach you the basics + +11 +00:00:30,120 --> 00:00:32,250 +of how to use a Transformer model, + +12 +00:00:32,250 --> 00:00:34,230 +fine-tune it on your own data set + +13 +00:00:34,230 --> 00:00:36,960 +and share the result with the community. + +14 +00:00:36,960 --> 00:00:39,420 +So second, we'll dive +deeper into our libraries + +15 +00:00:39,420 --> 00:00:42,360 +and teach you how to tackle any NLP task. + +16 +00:00:42,360 --> 00:00:44,430 +We're actively working on the last one + +17 +00:00:44,430 --> 00:00:47,280 +and hope to have it ready for +you for the spring of 2022. + +18 +00:00:48,510 --> 00:00:50,880 +The first chapter requires +no technical knowledge + +19 +00:00:50,880 --> 00:00:52,320 +and is a good introduction to learn + +20 +00:00:52,320 --> 00:00:54,180 +what Transformers models can do + +21 +00:00:54,180 --> 00:00:56,883 +and how it could be of use +to you or your company. + +22 +00:00:58,050 --> 00:01:01,110 +The next chapters require +a good knowledge of Python + +23 +00:01:01,110 --> 00:01:02,130 +and some basic knowledge of + +24 +00:01:02,130 --> 00:01:04,350 +Machine Learning and Deep Learning. + +25 +00:01:04,350 --> 00:01:07,110 +If you don't know what a +training and validation set are + +26 +00:01:07,110 --> 00:01:09,360 +or what gradient decent means, + +27 +00:01:09,360 --> 00:01:11,340 +you should look at an introductory course + +28 +00:01:11,340 --> 00:01:14,863 +such as the ones published by +deeplearning.ai or fast.ai. + +29 +00:01:16,200 --> 00:01:17,910 +It's also best if you have some basics + +30 +00:01:17,910 --> 00:01:21,150 +in one Deep Learning Framework, +PyTorch or TensorFlow. + +31 +00:01:21,150 --> 00:01:23,520 +Each part of the material +introduced in this course + +32 +00:01:23,520 --> 00:01:25,590 +has a version in both those frameworks, + +33 +00:01:25,590 --> 00:01:26,730 +so you will be able to pick the one + +34 +00:01:26,730 --> 00:01:28,230 +you are most comfortable with. + +35 +00:01:29,550 --> 00:01:31,740 +This is the team that +developed this course. + +36 +00:01:31,740 --> 00:01:33,120 +I'll now let each of the speakers + +37 +00:01:33,120 --> 00:01:34,570 +introduce themselves briefly. + +38 +00:01:37,230 --> 00:01:38,880 +- Hi, my name is Matthew, + +39 +00:01:38,880 --> 00:01:41,610 +and I'm a Machine Learning +Engineer at Hugging Face. 
+ +40 +00:01:41,610 --> 00:01:43,200 +I work on the open-source team + +41 +00:01:43,200 --> 00:01:45,180 +and I'm responsible for +maintaining particularly + +42 +00:01:45,180 --> 00:01:47,280 +the TensorFlow code there. + +43 +00:01:47,280 --> 00:01:50,130 +Previously, I was a Machine +Learning Engineer at Parsley, + +44 +00:01:50,130 --> 00:01:52,620 +who've recently been +acquired by Automatic, + +45 +00:01:52,620 --> 00:01:54,210 +and I was a postdoctoral researcher + +46 +00:01:54,210 --> 00:01:57,000 +before that at Trinity +College, Dublin in Ireland + +47 +00:01:57,000 --> 00:02:00,093 +working on computational +genetics and retinal disease. + +48 +00:02:02,400 --> 00:02:03,870 +- Hi, I'm Lysandre. + +49 +00:02:03,870 --> 00:02:05,640 +I'm a Machine Learning +Engineer at Hugging Face + +50 +00:02:05,640 --> 00:02:08,700 +and I'm specifically part +of the open-source team. + +51 +00:02:08,700 --> 00:02:10,890 +I've been at Hugging +Face for a few years now + +52 +00:02:10,890 --> 00:02:12,300 +and alongside my team members, + +53 +00:02:12,300 --> 00:02:13,890 +I've been working on most of the tools + +54 +00:02:13,890 --> 00:02:15,790 +that you'll get to see in this course. + +55 +00:02:18,270 --> 00:02:20,130 +- Hi, I'm Sylvain. + +56 +00:02:20,130 --> 00:02:22,140 +I'm a Research Engineer at Hugging Face + +57 +00:02:22,140 --> 00:02:25,830 +and one of the main maintainers +of the Transformers Library. + +58 +00:02:25,830 --> 00:02:28,110 +Previously, I worked at fast.ai + +59 +00:02:28,110 --> 00:02:30,420 +where I helped develop the fast.ai Library + +60 +00:02:30,420 --> 00:02:32,220 +as well as the online book. + +61 +00:02:32,220 --> 00:02:35,340 +Before that, I was a math +and computer science teacher + +62 +00:02:35,340 --> 00:02:36,173 +in France. + +63 +00:02:38,550 --> 00:02:41,340 +- Hi, my name is Sasha and I'm +a Researcher at Hugging Face, + +64 +00:02:41,340 --> 00:02:42,420 +working on the ethical, + +65 +00:02:42,420 --> 00:02:46,230 +environmental and social impacts +of machine learning models. + +66 +00:02:46,230 --> 00:02:49,020 +Previously, I was a +postdoctoral researcher at Mila, + +67 +00:02:49,020 --> 00:02:50,400 +University in Montreal + +68 +00:02:50,400 --> 00:02:53,040 +and I also worked as an +Applied AI Researcher + +69 +00:02:53,040 --> 00:02:55,140 +for the United Nations Global Pulse. + +70 +00:02:55,140 --> 00:02:57,300 +I've been involved in +projects such as CodeCarbon + +71 +00:02:57,300 --> 00:02:59,790 +and the Machine Learning +Impacts Calculator + +72 +00:02:59,790 --> 00:03:02,390 +to measure the carbon +footprint of machine learning. + +73 +00:03:05,160 --> 00:03:07,650 +- Hi, I'm Merve and I'm +a Developer Advocate + +74 +00:03:07,650 --> 00:03:09,390 +at Hugging Face. + +75 +00:03:09,390 --> 00:03:12,480 +Previously, I was working as +a Machine Learning Engineer + +76 +00:03:12,480 --> 00:03:15,360 +building NLP tools and chat bots. + +77 +00:03:15,360 --> 00:03:17,670 +Currently, I'm working to improve the hub + +78 +00:03:17,670 --> 00:03:19,563 +and democratize machine learning. + +79 +00:03:22,140 --> 00:03:23,670 +- Hello everyone. + +80 +00:03:23,670 --> 00:03:27,210 +My name is Lucile and I'm +a Machine Learning Engineer + +81 +00:03:27,210 --> 00:03:28,353 +at Hugging Face. 
+ +82 +00:03:29,580 --> 00:03:32,550 +To tell you in two sentences who I am, + +83 +00:03:32,550 --> 00:03:35,590 +I work on the development and +support of open-source tools + +84 +00:03:36,600 --> 00:03:39,595 +and I also participate in +several research project + +85 +00:03:39,595 --> 00:03:41,795 +in the field of Natural +Language Processing. + +86 +00:03:44,610 --> 00:03:45,540 +- Good day there. + +87 +00:03:45,540 --> 00:03:47,550 +I'm Lewis and I'm a +Machine Learning Engineer + +88 +00:03:47,550 --> 00:03:50,130 +in the open-source team at Hugging Face. + +89 +00:03:50,130 --> 00:03:53,490 +I'm passionate about developing +tools for the NLP community + +90 +00:03:53,490 --> 00:03:55,050 +and you'll see me at +many of Hugging Face's + +91 +00:03:55,050 --> 00:03:56,910 +outreach activities. + +92 +00:03:56,910 --> 00:03:58,470 +Before joining Hugging Face, + +93 +00:03:58,470 --> 00:03:59,790 +I spent several years developing + +94 +00:03:59,790 --> 00:04:01,860 +machine learning applications for startups + +95 +00:04:01,860 --> 00:04:04,230 +and enterprises in the domains of NLP, + +96 +00:04:04,230 --> 00:04:07,260 +topological data analysis and time series. + +97 +00:04:07,260 --> 00:04:10,110 +In a former life, I was +a theoretical physicist, + +98 +00:04:10,110 --> 00:04:11,760 +where I researched particle collisions + +99 +00:04:11,760 --> 00:04:13,560 +at the Large Hadron Collider and so. + +100 +00:04:15,900 --> 00:04:18,450 +- Hey, I'm Leandro and I'm +a Machine Learning Engineer + +101 +00:04:18,450 --> 00:04:21,030 +in the open-source team at Hugging Face. + +102 +00:04:21,030 --> 00:04:23,460 +Before joining Hugging Face, +I worked as a Data Scientist + +103 +00:04:23,460 --> 00:04:26,733 +in Switzerland and have taught +Data Science at University. + diff --git a/subtitles/en/01_the-pipeline-function.srt b/subtitles/en/01_the-pipeline-function.srt index c5fff35db..44c3bf8bb 100644 --- a/subtitles/en/01_the-pipeline-function.srt +++ b/subtitles/en/01_the-pipeline-function.srt @@ -1,223 +1,445 @@ -1 -00:00:05,680 --> 00:00:06,720 -The pipeline function.   - -2 -00:00:09,360 --> 00:00:13,280 -The pipeline function is the most  -high-level API of the Transformers library.   - -3 -00:00:13,840 --> 00:00:21,200 -It regroups together all the steps to go from raw  -texts to usable predictions. The model used is at   - -4 -00:00:21,200 --> 00:00:26,720 -the core of a pipeline, but the pipeline also  -include all the necessary pre-processing (since   - -5 -00:00:26,720 --> 00:00:32,800 -the model does not expect texts, but numbers) as  -well as some post-processing to make the output of   - -6 -00:00:32,800 --> 00:00:39,440 -the model human-readable. Let's look at a first  -example with the sentiment analysis pipeline.   - -7 -00:00:40,480 --> 00:00:46,080 -This pipeline performs text classification on a  -given input, and determines if it's positive or   - -8 -00:00:46,080 --> 00:00:53,120 -negative. Here, it attributed the positive label  -on the given text, with a confidence of 95%.   - -9 -00:00:55,440 --> 00:00:59,520 -You can pass multiple texts to the  -same pipeline, which will be processed   - -10 -00:00:59,520 --> 00:01:05,840 -and passed through the model together, as a  -batch. The output is a list of individual results,   - -11 -00:01:05,840 --> 00:01:12,080 -in the same order as the input texts. Here we  -find the same label and score for the first text,   - -12 -00:01:12,080 --> 00:01:16,480 -and the second text is judged  -positive with a confidence of 99.99%.   
- -13 -00:01:18,480 --> 00:01:22,720 -The zero-shot classification pipeline is a  -more general text-classification pipeline:   - -14 -00:01:23,360 --> 00:01:28,320 -it allows you to provide the labels you  -want. Here we want to classify our input   - -15 -00:01:28,320 --> 00:01:35,360 -text along the labels "education", "politics" and  -"business". The pipeline successfully recognizes   - -16 -00:01:35,360 --> 00:01:39,360 -it's more about education than the  -other labels, with a confidence of 84%.   - -17 -00:01:41,440 --> 00:01:47,360 -Moving on to other tasks, the text generation  -pipeline will auto-complete a given prompt. The   - -18 -00:01:47,360 --> 00:01:52,560 -output is generated with a bit of randomness, so  -it changes each time you call the generator object   - -19 -00:01:52,560 --> 00:01:58,960 -on a given prompt. Up until now, we have used the  -pipeline API with the default model associated to   - -20 -00:01:58,960 --> 00:02:03,920 -each task, but you can use it with any model that  -has been pretrained or fine-tuned on this task.   - -21 -00:02:06,320 --> 00:02:12,320 -Going on the model hub (huggingface.co/models),  -you can filter the available models by task.   - -22 -00:02:13,120 --> 00:02:16,960 -The default model used in our  -previous example was gpt2,   - -23 -00:02:16,960 --> 00:02:20,080 -but there are many more models  -available, and not just in English!   - -24 -00:02:21,280 --> 00:02:27,120 -Let's go back to the text generation pipeline and  -load it with another model, distilgpt2. This is   - -25 -00:02:27,120 --> 00:02:33,120 -a lighter version of gpt2 created by the Hugging  -Face team. When applying the pipeline to a given   - -26 -00:02:33,120 --> 00:02:39,280 -prompt, we can specify several arguments, such as  -the maximum length of the generated texts, or the   - -27 -00:02:39,280 --> 00:02:43,520 -number of sentences we want to return (since  -there is some randomness in the generation).   - -28 -00:02:45,920 --> 00:02:50,480 -Generating text by guessing the next word in a  -sentence was the pretraining objective of GPT-2,   - -29 -00:02:51,200 --> 00:02:56,240 -the fill mask pipeline is the pretraining  -objective of BERT, which is to guess the value   - -30 -00:02:56,240 --> 00:03:02,480 -of masked word. In this case, we ask the two most  -likely values for the missing words (according to   - -31 -00:03:02,480 --> 00:03:09,120 -the model) and get mathematical or computational  -as possible answers. Another task Transformers   - -32 -00:03:09,120 --> 00:03:13,920 -model can perform is to classify each word in  -the sentence instead of the sentence as a whole.   - -33 -00:03:14,720 --> 00:03:21,040 -One example of this is Named Entity Recognition,  -which is the task of identifying entities, such as   - -34 -00:03:21,040 --> 00:03:29,360 -persons, organizations or locations in a sentence.  -Here, the model correctly finds the person   - -35 -00:03:29,360 --> 00:03:36,000 -(Sylvain), the organization (Hugging Face) as well  -as the location (Brooklyn) inside the input text.   - -36 -00:03:37,440 --> 00:03:42,080 -The grouped_entities=True argument used  -is to make the pipeline group together   - -37 -00:03:42,080 --> 00:03:46,080 -the different words linked to the same  -entity (such as Hugging and Face here).   - -38 -00:03:48,000 --> 00:03:52,160 -Another task available with the pipeline  -API is extractive question answering.   
- -39 -00:03:52,720 --> 00:03:58,080 -Providing a context and a question, the model  -will identify the span of text in the context   - -40 -00:03:58,080 --> 00:04:03,920 -containing the answer to the question. Getting  -short summaries of very long articles is   - -41 -00:04:03,920 --> 00:04:07,840 -also something the Transformers library can  -help with, with the summarization pipeline.   - -42 -00:04:09,360 --> 00:04:15,040 -Finally, the last task supported by the  -pipeline API is translation. Here we use   - -43 -00:04:15,040 --> 00:04:19,440 -a French/English model found on the model hub  -to get the English version of our input text.   - -44 -00:04:21,360 --> 00:04:24,720 -Here is a brief summary of all the  -tasks we looked into in this video.   - -45 -00:04:25,280 --> 00:04:27,840 -Try then out through the inference  -widgets in the model hub! +1 +00:00:00,069 --> 00:00:01,341 +(screen whooshes) + +2 +00:00:01,341 --> 00:00:02,449 +(face logo whooshes) + +3 +00:00:02,449 --> 00:00:05,880 +(screen whooshes) + +4 +00:00:05,880 --> 00:00:07,080 +- The pipeline function. + +5 +00:00:09,540 --> 00:00:12,020 +The pipeline function is +the most high level API + +6 +00:00:12,020 --> 00:00:14,010 +of the Transformers library. + +7 +00:00:14,010 --> 00:00:16,050 +It regroups together all the steps + +8 +00:00:16,050 --> 00:00:18,873 +to go from raw texts +to usable predictions. + +9 +00:00:20,228 --> 00:00:22,980 +The model used is at +the core of a pipeline, + +10 +00:00:22,980 --> 00:00:24,390 +but the pipeline also include + +11 +00:00:24,390 --> 00:00:26,610 +all the necessary pre-processing, + +12 +00:00:26,610 --> 00:00:30,240 +since the model does not +expect texts, but number, + +13 +00:00:30,240 --> 00:00:32,040 +as well as some post-processing, + +14 +00:00:32,040 --> 00:00:34,533 +to make the output of +the model human-readable. + +15 +00:00:35,910 --> 00:00:37,593 +Let's look at a first example + +16 +00:00:37,593 --> 00:00:39,693 +with the sentiment analysis pipeline. + +17 +00:00:40,740 --> 00:00:44,670 +This pipeline performs text +classification on a given input + +18 +00:00:44,670 --> 00:00:46,953 +and determines if it's +positive or negative. + +19 +00:00:47,910 --> 00:00:51,750 +Here, it attributed the positive +label on the given text, + +20 +00:00:51,750 --> 00:00:54,413 +with a confidence of 95%. + +21 +00:00:55,650 --> 00:00:58,470 +You can pass multiple +texts to the same pipeline, + +22 +00:00:58,470 --> 00:01:00,270 +which will be processed and passed + +23 +00:01:00,270 --> 00:01:02,673 +through the model together as a batch. + +24 +00:01:03,570 --> 00:01:05,970 +The output is a list of individual results + +25 +00:01:05,970 --> 00:01:07,923 +in the same order as the input texts. + +26 +00:01:08,790 --> 00:01:12,270 +Here we find the same label +and score for the first text, + +27 +00:01:12,270 --> 00:01:14,443 +and the second text is judged negative + +28 +00:01:14,443 --> 00:01:17,243 +with a confidence of 99.9%. + +29 +00:01:18,720 --> 00:01:20,700 +The zero-shot classification pipeline + +30 +00:01:20,700 --> 00:01:23,610 +is a more general +text-classification pipeline, + +31 +00:01:23,610 --> 00:01:26,370 +it allows you to provide +the labels you want. + +32 +00:01:26,370 --> 00:01:29,850 +Here we want to classify our +input text along the labels, + +33 +00:01:29,850 --> 00:01:32,643 +education, politics, and business. 
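Not part of the original subtitle patch: a minimal sketch of the two pipeline calls described in the cues above, assuming the transformers library is installed. The input strings are illustrative placeholders, and the exact labels and scores depend on the default checkpoints, which may change between library versions.

    from transformers import pipeline

    # Sentiment analysis: each input gets a POSITIVE/NEGATIVE label and a confidence
    # score, returned in the same order as the inputs when passed as a batch.
    classifier = pipeline("sentiment-analysis")
    print(classifier(["I've been waiting for a course like this my whole life.",
                      "I hate this so much!"]))

    # Zero-shot classification: you supply the candidate labels yourself.
    zero_shot = pipeline("zero-shot-classification")
    print(zero_shot("This is a course about the Transformers library",
                    candidate_labels=["education", "politics", "business"]))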
+ +34 +00:01:33,540 --> 00:01:35,580 +The pipeline successfully recognizes + +35 +00:01:35,580 --> 00:01:38,280 +it's more about education +than the other labels, + +36 +00:01:38,280 --> 00:01:40,643 +with a confidence of 84%. + +37 +00:01:41,670 --> 00:01:43,110 +Moving on to other tasks, + +38 +00:01:43,110 --> 00:01:45,030 +the text generation pipeline will + +39 +00:01:45,030 --> 00:01:46,533 +auto-complete a given prompt. + +40 +00:01:47,460 --> 00:01:49,980 +The output is generated +with a bit of randomness, + +41 +00:01:49,980 --> 00:01:52,800 +so it changes each time you +call the generator object + +42 +00:01:52,800 --> 00:01:53,763 +on a given prompt. + +43 +00:01:54,990 --> 00:01:57,123 +Up until now, we've used +the the pipeline API + +44 +00:01:57,123 --> 00:02:00,360 +with the default model +associated to each task, + +45 +00:02:00,360 --> 00:02:02,880 +but you can use it with any +model that has been pretrained + +46 +00:02:02,880 --> 00:02:04,263 +or fine-tuned on this task. + +47 +00:02:06,540 --> 00:02:10,350 +Going on the model hub, +huggingface.co/models + +48 +00:02:10,350 --> 00:02:13,350 +you can filter the +available models by task. + +49 +00:02:13,350 --> 00:02:17,190 +The default model used in our +previous example was gpt2, + +50 +00:02:17,190 --> 00:02:19,290 +but there are many more models available, + +51 +00:02:19,290 --> 00:02:20,523 +and not just in English. + +52 +00:02:21,450 --> 00:02:23,670 +Let's go back to the +text generation pipeline + +53 +00:02:23,670 --> 00:02:26,193 +and load it with another +model, distilgpt2. + +54 +00:02:27,060 --> 00:02:28,950 +This is a lighter version of gpt2 + +55 +00:02:28,950 --> 00:02:30,603 +created by the Hugging Face team. + +56 +00:02:31,740 --> 00:02:34,110 +When applying the pipeline +to a given prompt, + +57 +00:02:34,110 --> 00:02:36,360 +we can specify several arguments + +58 +00:02:36,360 --> 00:02:39,240 +such as the maximum length +of the generated texts, + +59 +00:02:39,240 --> 00:02:41,700 +or the number of sentences +we want to return, + +60 +00:02:41,700 --> 00:02:44,150 +since there is some +randomness in the generation. + +61 +00:02:46,080 --> 00:02:48,750 +Generating texts by guessing +the next word in a sentence + +62 +00:02:48,750 --> 00:02:51,450 +was the pretraining objective of GPT-2. + +63 +00:02:51,450 --> 00:02:55,140 +The fill mask pipeline is the +pretraining objective of BERT, + +64 +00:02:55,140 --> 00:02:57,363 +which is to guess the +value of masked word. + +65 +00:02:58,260 --> 00:03:01,020 +In this case, we ask the +two most likely values + +66 +00:03:01,020 --> 00:03:03,660 +for the missing words, +according to the model, + +67 +00:03:03,660 --> 00:03:07,053 +and get mathematical or +computational as possible answers. + +68 +00:03:08,280 --> 00:03:10,170 +Another task Transformers +model can perform + +69 +00:03:10,170 --> 00:03:12,660 +is to classify each word in the sentence + +70 +00:03:12,660 --> 00:03:14,970 +instead of the sentence as a whole. + +71 +00:03:14,970 --> 00:03:18,390 +One example of this is +Named Entity Recognition, + +72 +00:03:18,390 --> 00:03:20,820 +which is the task of identifying entities, + +73 +00:03:20,820 --> 00:03:25,323 +such as persons, organizations +or locations in a sentence. 
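As a companion to the text-generation and fill-mask cues above, a hedged sketch of what those calls look like in code. The prompt and masked sentence are placeholders; generation involves randomness, so outputs differ between runs, and the mask token (<mask> here) depends on the default checkpoint.

    from transformers import pipeline

    # Text generation with the lighter distilgpt2 checkpoint mentioned in the video.
    generator = pipeline("text-generation", model="distilgpt2")
    print(generator("In this course, we will teach you how to",
                    max_length=30, num_return_sequences=2))

    # Fill-mask: ask for the two most likely values of the masked word.
    unmasker = pipeline("fill-mask")
    print(unmasker("This course will teach you all about <mask> models.", top_k=2))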
+ +74 +00:03:26,400 --> 00:03:30,570 +Here, the model correctly +finds the person, Sylvain, + +75 +00:03:30,570 --> 00:03:32,453 +the organization, Hugging Face, + +76 +00:03:32,453 --> 00:03:35,010 +as well as the location, Brooklyn, + +77 +00:03:35,010 --> 00:03:36,303 +inside the input text. + +78 +00:03:37,661 --> 00:03:40,230 +The grouped_entities=True argument used + +79 +00:03:40,230 --> 00:03:42,330 +is to make the pipeline group together + +80 +00:03:42,330 --> 00:03:44,790 +the different words +linked to the same entity, + +81 +00:03:44,790 --> 00:03:46,353 +such as Hugging and Face here. + +82 +00:03:48,270 --> 00:03:50,670 +Another task available +with the pipeline API + +83 +00:03:50,670 --> 00:03:52,920 +is extractive question answering. + +84 +00:03:52,920 --> 00:03:55,380 +Providing a context and a question, + +85 +00:03:55,380 --> 00:03:58,290 +the model will identify the +span of text in the context + +86 +00:03:58,290 --> 00:04:00,190 +containing the answer to the question. + +87 +00:04:01,650 --> 00:04:03,960 +Getting short summaries +of very long articles + +88 +00:04:03,960 --> 00:04:06,540 +is also something the Transformers +library can help with, + +89 +00:04:06,540 --> 00:04:08,140 +with the summarization pipeline. + +90 +00:04:09,480 --> 00:04:12,570 +Finally, the last task +supported by the pipeline API + +91 +00:04:12,570 --> 00:04:14,130 +is translation. + +92 +00:04:14,130 --> 00:04:16,170 +Here we use a French/English model + +93 +00:04:16,170 --> 00:04:17,460 +found on the model hub + +94 +00:04:17,460 --> 00:04:19,893 +to get the English +version of our input text. + +95 +00:04:21,600 --> 00:04:23,490 +Here is a brief summary of all the tasks + +96 +00:04:23,490 --> 00:04:25,500 +we've looked into in this video. + +97 +00:04:25,500 --> 00:04:27,390 +Try then out through the inference widgets + +98 +00:04:27,390 --> 00:04:28,327 +in the model hub. + +99 +00:04:30,459 --> 00:04:33,475 +(screen whooshes) + +100 +00:04:33,475 --> 00:04:35,175 +(logo whooshes) + diff --git a/subtitles/en/02_the-carbon-footprint-of-transformers.srt b/subtitles/en/02_the-carbon-footprint-of-transformers.srt index 5147f8e12..101d676a1 100644 --- a/subtitles/en/02_the-carbon-footprint-of-transformers.srt +++ b/subtitles/en/02_the-carbon-footprint-of-transformers.srt @@ -1 +1,581 @@ -No transcript found for this video! \ No newline at end of file +1 +00:00:05,580 --> 00:00:08,820 +- So let's talk about the carbon +footprint of transformers. + +2 +00:00:08,820 --> 00:00:10,530 +Maybe you've seen +headlines such as this one + +3 +00:00:10,530 --> 00:00:13,530 +that training a single AI +model can emit as much carbon + +4 +00:00:13,530 --> 00:00:16,020 +as five cars in their lifetimes. + +5 +00:00:16,020 --> 00:00:19,440 +So when is this true +and is it always true? + +6 +00:00:19,440 --> 00:00:21,803 +Well, it actually depends +on several things. + +7 +00:00:21,803 --> 00:00:23,430 +Most importantly, it depends + +8 +00:00:23,430 --> 00:00:24,960 +on the type of energy you're using. + +9 +00:00:24,960 --> 00:00:26,267 +If you're using renewable energy such as + +10 +00:00:26,267 --> 00:00:30,670 +solar, wind, hydroelectricity, +you're really + +11 +00:00:30,670 --> 00:00:33,810 +not emitting any carbon +at all, very, very little. 
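Rounding out the pipeline examples from the subtitles above (named entity recognition and extractive question answering), a minimal sketch assuming transformers is installed; the sentences are placeholders modelled on the ones described in the video.

    from transformers import pipeline

    # NER; grouped_entities=True merges the sub-words belonging to the same entity.
    ner = pipeline("ner", grouped_entities=True)
    print(ner("My name is Sylvain and I work at Hugging Face in Brooklyn."))

    # Extractive question answering: the answer is a span copied out of the context.
    question_answerer = pipeline("question-answering")
    print(question_answerer(
        question="Where do I work?",
        context="My name is Sylvain and I work at Hugging Face in Brooklyn.",
    ))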
+ +12 +00:00:33,810 --> 00:00:36,769 +If you're using non-renewable +energy sources such as coal + +13 +00:00:36,769 --> 00:00:39,570 +then their carbon +footprint is a lot higher + +14 +00:00:39,570 --> 00:00:43,260 +'cuz essentially you are emitting +a lot of greenhouse gases. + +15 +00:00:43,260 --> 00:00:44,670 +Another aspect is training time. + +16 +00:00:44,670 --> 00:00:47,232 +So the longer you train, +the more energy you use + +17 +00:00:47,232 --> 00:00:50,250 +the more energy you use, the +more carbon you emit, right? + +18 +00:00:50,250 --> 00:00:51,270 +So this really adds up + +19 +00:00:51,270 --> 00:00:53,520 +especially if you're +training large models for + +20 +00:00:53,520 --> 00:00:56,460 +for hours and days and weeks. + +21 +00:00:56,460 --> 00:00:58,380 +The hardware you use also matters + +22 +00:00:58,380 --> 00:01:00,930 +because some GPUs, for +example, are more efficient + +23 +00:01:00,930 --> 00:01:05,460 +than others and utilizing +efficiency use properly. + +24 +00:01:05,460 --> 00:01:07,500 +So using them a hundred +percent all the time + +25 +00:01:07,500 --> 00:01:10,650 +can really reduce the energy +consumption that you have. + +26 +00:01:10,650 --> 00:01:13,290 +And then once again, reduce +your carbon footprint. + +27 +00:01:13,290 --> 00:01:15,870 +There's also other aspects such as IO + +28 +00:01:15,870 --> 00:01:17,730 +such as data, et cetera, et cetera. + +29 +00:01:17,730 --> 00:01:20,940 +But these are the main three +that you should focus on. + +30 +00:01:20,940 --> 00:01:23,340 +So when I talk about energy +sources and carbon intensity + +31 +00:01:23,340 --> 00:01:24,420 +what does that really mean? + +32 +00:01:24,420 --> 00:01:27,480 +So if you look at the top of the screen + +33 +00:01:27,480 --> 00:01:30,480 +you have a carbon footprint + +34 +00:01:30,480 --> 00:01:33,860 +of a cloud computing +instance in Mumbai, India + +35 +00:01:33,860 --> 00:01:38,700 +which emits 920 grams of +CO2 per kilowatt hour. + +36 +00:01:38,700 --> 00:01:40,110 +This is almost one kilogram + +37 +00:01:40,110 --> 00:01:43,680 +of CO2 per kilowatt hour +of electricity used. + +38 +00:01:43,680 --> 00:01:45,150 +If you compare that with Canada, Montreal + +39 +00:01:45,150 --> 00:01:48,720 +where I am right now, 20 +grams of CO2 per kilo hour. + +40 +00:01:48,720 --> 00:01:50,040 +So that's a really, really big difference. + +41 +00:01:50,040 --> 00:01:54,240 +Almost more than 40 +times more carbon emitted + +42 +00:01:54,240 --> 00:01:55,950 +in Mumbai versus Montreal. + +43 +00:01:55,950 --> 00:01:57,720 +And so this can really, really add up. + +44 +00:01:57,720 --> 00:01:59,820 +If you're training a model +for weeks, for example + +45 +00:01:59,820 --> 00:02:01,920 +you're multiplying times 40 + +46 +00:02:01,920 --> 00:02:03,450 +the carbon that you're emitting. + +47 +00:02:03,450 --> 00:02:05,070 +So choosing the right instance + +48 +00:02:05,070 --> 00:02:07,080 +choosing a low carbon compute instance + +49 +00:02:07,080 --> 00:02:09,690 +is really the most impactful +thing that you can do. + +50 +00:02:09,690 --> 00:02:13,020 +And this is where it can really add up + +51 +00:02:13,020 --> 00:02:15,930 +if you're training in a very intensive + +52 +00:02:15,930 --> 00:02:17,580 +in a very carbon intensive region + +53 +00:02:19,170 --> 00:02:21,750 +other elements to consider, for example + +54 +00:02:21,750 --> 00:02:22,770 +using pre-trained models + +55 +00:02:22,770 --> 00:02:25,590 +that's the machine learning +equivalent of recycling. 
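The arithmetic behind the Mumbai versus Montreal comparison in the cues above, written out as a short sketch; only the two carbon-intensity numbers come from the video, the training energy figure is a made-up placeholder.

    # Grams of CO2 emitted per kWh of electricity, as quoted in the video.
    mumbai_g_per_kwh = 920
    montreal_g_per_kwh = 20
    print(mumbai_g_per_kwh / montreal_g_per_kwh)  # 46.0 -> "more than 40 times" more carbon

    # Estimated emissions scale linearly with the energy a training run consumes.
    training_energy_kwh = 1_000  # placeholder value, not from the video
    print(training_energy_kwh * mumbai_g_per_kwh / 1000, "kg of CO2 in Mumbai")
    print(training_energy_kwh * montreal_g_per_kwh / 1000, "kg of CO2 in Montreal")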
+
+56
+00:02:25,590 --> 00:02:28,292
+When you have pre-trained
+models available using them
+
+57
+00:02:28,292 --> 00:02:30,120
+you're not emitting any
+carbon at all, right?
+
+58
+00:02:30,120 --> 00:02:31,230
+You're not retraining anything.
+
+59
+00:02:31,230 --> 00:02:33,450
+So that's also doing your homework
+
+60
+00:02:33,450 --> 00:02:35,574
+and kind of looking around
+what already exists.
+
+61
+00:02:35,574 --> 00:02:37,890
+Fine tuning instead of
+training from scratch.
+
+62
+00:02:37,890 --> 00:02:38,723
+So once again
+
+63
+00:02:38,723 --> 00:02:40,590
+if you find a model that
+is almost what you need
+
+64
+00:02:40,590 --> 00:02:43,530
+but not quite fine tuning
+the last couple of layers
+
+65
+00:02:43,530 --> 00:02:45,210
+making it really fit your purpose instead
+
+66
+00:02:45,210 --> 00:02:46,500
+of training a large transformer
+
+67
+00:02:46,500 --> 00:02:48,810
+from scratch can really help,
+
+68
+00:02:48,810 --> 00:02:51,270
+starting with smaller experiments
+
+69
+00:02:51,270 --> 00:02:52,800
+and debugging as you go.
+
+70
+00:02:52,800 --> 00:02:54,630
+So that means, for example, training
+
+71
+00:02:54,630 --> 00:02:58,770
+figuring out data encoding,
+figuring out, you know
+
+72
+00:02:58,770 --> 00:03:01,170
+making sure that there's
+no small bugs, that you'll
+
+73
+00:03:01,170 --> 00:03:03,840
+you'll realize, you know,
+16 hours into training
+
+74
+00:03:03,840 --> 00:03:05,820
+starting small and really making sure
+
+75
+00:03:05,820 --> 00:03:08,760
+that what you're doing, what
+your code is, is stable.
+
+76
+00:03:08,760 --> 00:03:11,430
+And then finally doing
+a literature review to
+
+77
+00:03:11,430 --> 00:03:13,740
+choose hyper parameter
+ranges and then following
+
+78
+00:03:13,740 --> 00:03:15,900
+up with a random search
+instead of a grid search.
+
+79
+00:03:15,900 --> 00:03:18,420
+So random searches for hyper parameters
+
+80
+00:03:18,420 --> 00:03:21,300
+combinations have actually
+shown to be as efficient
+
+81
+00:03:21,300 --> 00:03:24,000
+in finding the optimal
+configuration as grid search.
+
+82
+00:03:24,000 --> 00:03:27,510
+But obviously you're not trying
+all possible combinations
+
+83
+00:03:27,510 --> 00:03:29,520
+you're only trying a subset of them.
+
+84
+00:03:29,520 --> 00:03:31,800
+So this can really help as well.
+
+85
+00:03:31,800 --> 00:03:32,760
+So now if we go back
+
+86
+00:03:32,760 --> 00:03:36,300
+to the original paper by
+Strubell et al. in 2019
+
+87
+00:03:36,300 --> 00:03:39,180
+the infamous five cars
+in their lifetimes paper.
+
+88
+00:03:39,180 --> 00:03:40,013
+If you just look
+
+89
+00:03:40,013 --> 00:03:43,606
+at a transformer of 200
+million parameters,
+
+90
+00:03:43,606 --> 00:03:46,950
+its carbon footprint is
+around 200 pounds of CO2
+
+91
+00:03:46,950 --> 00:03:47,940
+which is significant
+
+92
+00:03:47,940 --> 00:03:49,980
+but it's nowhere near five cars, right?
+
+93
+00:03:49,980 --> 00:03:52,893
+It's not even a transatlantic flight.
+
+94
+00:03:52,893 --> 00:03:55,020
+How it really adds up is when you're doing
+
+95
+00:03:55,020 --> 00:03:56,190
+neural architecture search
+
+96
+00:03:56,190 --> 00:03:58,560
+when you're doing hyper
+parameter tuning, and
+
+97
+00:03:58,560 --> 00:04:00,930
+this is trying all possible combinations
+
+98
+00:04:00,930 --> 00:04:01,763
+et cetera, et cetera.
+
+99
+00:04:01,763 --> 00:04:02,596
+And this is where
+
+100
+00:04:02,596 --> 00:04:05,400
+like the 600,000 pounds of CO2 came from.
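A hedged illustration of the random-search idea mentioned in the cues above: instead of exhausting every grid combination, sample a handful of configurations. The ranges and trial count are illustrative only, not taken from the video.

    import random

    # A full grid over these ranges would cost 4 * 3 * 3 = 36 training runs.
    learning_rates = [1e-5, 3e-5, 5e-5, 1e-4]
    batch_sizes = [8, 16, 32]
    weight_decays = [0.0, 0.01, 0.1]

    n_trials = 5  # random search only samples a small subset of the grid
    for _ in range(n_trials):
        config = {
            "learning_rate": random.choice(learning_rates),
            "batch_size": random.choice(batch_sizes),
            "weight_decay": random.choice(weight_decays),
        }
        print("would launch a training run with", config)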
+ +101 +00:04:05,400 --> 00:04:08,490 +So this is really where things add up. + +102 +00:04:08,490 --> 00:04:11,880 +So, but if you're doing things +mindfully and conscientiously + +103 +00:04:11,880 --> 00:04:16,410 +then your carbon footprint +wont be as big as, + +104 +00:04:16,410 --> 00:04:20,040 +as the paper implied, some tools to figure + +105 +00:04:20,040 --> 00:04:22,111 +out how much CO2 exactly you're emitting. + +106 +00:04:22,111 --> 00:04:24,270 +There's a web-based tool called machine + +107 +00:04:24,270 --> 00:04:26,430 +learning submissions +calculator, which allows you + +108 +00:04:26,430 --> 00:04:29,010 +to manually input, for example, +which hardware you used + +109 +00:04:29,010 --> 00:04:30,488 +how many hours you used it for + +110 +00:04:30,488 --> 00:04:34,260 +where it was located +locally or in the cloud. + +111 +00:04:34,260 --> 00:04:35,640 +And then it's gonna give you an estimate + +112 +00:04:35,640 --> 00:04:37,560 +of how much CO2 you emitted. + +113 +00:04:37,560 --> 00:04:40,200 +Another tool that does +this programmatically, + +114 +00:04:40,200 --> 00:04:41,190 +is called code carbon. + +115 +00:04:41,190 --> 00:04:45,112 +So you can PIP install it, you +can, you can go to the GitHub + +116 +00:04:45,112 --> 00:04:48,120 +and essentially it runs +in parallel to your code. + +117 +00:04:48,120 --> 00:04:49,085 +So essentially you call it + +118 +00:04:49,085 --> 00:04:51,060 +and then you do all your training. + +119 +00:04:51,060 --> 00:04:53,760 +And then at the end it's +gonna give you an estimate + +120 +00:04:53,760 --> 00:04:57,210 +a CSV file with an +estimate of your emissions. + +121 +00:04:57,210 --> 00:04:59,250 +And it's gonna give you some comparisons. + +122 +00:04:59,250 --> 00:05:01,230 +It's got a visual UI +where you can really look + +123 +00:05:01,230 --> 00:05:04,680 +at how this compares to +driving a car or watching TV. + +124 +00:05:04,680 --> 00:05:06,060 +So it can give you an idea + +125 +00:05:06,060 --> 00:05:07,740 +of the scope of your emissions as well. + +126 +00:05:07,740 --> 00:05:09,930 +And actually, code carbon is +already integrated into auto + +127 +00:05:09,930 --> 00:05:12,270 +and LP and hopefully +people will be using it + +128 +00:05:12,270 --> 00:05:15,240 +out of the box and easily +tracking their emissions all + +129 +00:05:15,240 --> 00:05:17,523 +through training and +deploying transformers. + diff --git a/subtitles/en/03_what-is-transfer-learning.srt b/subtitles/en/03_what-is-transfer-learning.srt index 29212cdc2..80c9ddeac 100644 --- a/subtitles/en/03_what-is-transfer-learning.srt +++ b/subtitles/en/03_what-is-transfer-learning.srt @@ -1,203 +1,396 @@ -1 -00:00:05,440 --> 00:00:07,120 -What is transfer learning?   - -2 -00:00:09,360 --> 00:00:13,760 -The idea of Transfer Learning is to leverage the  -knowledge acquired by a model trained with lots of   - -3 -00:00:13,760 --> 00:00:20,720 -data on another task. The model A will be trained  -specifically for task A. Now, let's say you want   - -4 -00:00:20,720 --> 00:00:26,320 -to train a model B for a different task. One  -option would be to train the model from scratch.   - -5 -00:00:27,120 --> 00:00:34,240 -This could take lots of computation, time and  -data. Instead, we could initialize model B with   - -6 -00:00:34,240 --> 00:00:38,880 -the same weights as model A, transferring  -the knowledge of model A on task B.   - -7 -00:00:40,800 --> 00:00:47,040 -When training from scratch, all the model’s  -weight are initialized randomly. 
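Tying back to the carbon-footprint video above: the code carbon package it mentions is used roughly as sketched below. This is based on its documented EmissionsTracker interface; details such as the output file name can differ between versions.

    # pip install codecarbon
    from codecarbon import EmissionsTracker

    tracker = EmissionsTracker()  # runs alongside your own code
    tracker.start()
    # ... your training loop goes here ...
    emissions = tracker.stop()  # estimate in kg of CO2-eq, also written to emissions.csv
    print(f"Estimated {emissions} kg of CO2-eq for this run")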
In this example,   - -8 -00:00:47,040 --> 00:00:52,480 -we are training a BERT model on the task of  -recognizing if two sentences are similar or not.   - -9 -00:00:53,680 --> 00:00:58,560 -On the left, it’s trained from scratch, and  -on the right, it’s fine-tuning a pretrained   - -10 -00:00:58,560 --> 00:01:04,080 -model. As we can see, using transfer learning  -and the pretrained model yields better results.   - -11 -00:01:04,960 --> 00:01:09,360 -And it doesn’t matter if we train longer, the  -training from scratch is capped around 70%   - -12 -00:01:09,360 --> 00:01:13,040 -accuracy while the pretrained  -model beats the 86% easily.   - -13 -00:01:14,240 --> 00:01:18,720 -This is because pretrained models are usually  -trained on large amounts of data that provide   - -14 -00:01:18,720 --> 00:01:22,720 -the model with a statistical understanding  -of the language used during pretraining.   - -15 -00:01:24,240 --> 00:01:28,960 -In computer vision, transfer learning has been  -applied successfully for almost ten years.   - -16 -00:01:29,840 --> 00:01:35,840 -Models are frequently pretrained on ImageNet, a  -dataset containing 1.2 millions of photo images.   - -17 -00:01:36,880 --> 00:01:41,680 -Each image is classified by one of  -1000 labels. Training like this,   - -18 -00:01:42,240 --> 00:01:48,960 -on labeled data is called supervised  -learning. In Natural Language Processing,   - -19 -00:01:48,960 --> 00:01:54,320 -transfer learning is a bit more recent. A key  -difference with ImageNet is that the pretraining   - -20 -00:01:54,320 --> 00:01:59,280 -is usually self-supervised, which means it  -doesn’t require humans annotations for the labels.   - -21 -00:02:00,480 --> 00:02:05,040 -A very common pretraining objective is  -to guess the next word in a sentence,   - -22 -00:02:05,040 --> 00:02:08,720 -which only requires lots and  -lots of text. GPT-2 for instance,   - -23 -00:02:09,360 --> 00:02:16,720 -was pretrained this way using the content of 45  -millions links posted by users on Reddit. Another   - -24 -00:02:16,720 --> 00:02:21,520 -example of self-supervised pretraining objective  -is to predict the value of randomly masked words,   - -25 -00:02:22,160 --> 00:02:25,360 -which is similar to fill-in-the-blank  -tests you may have done in school.   - -26 -00:02:26,560 --> 00:02:31,520 -BERT was pretrained this way using the English  -Wikipedia and 11,000 unpublished books.   - -27 -00:02:32,960 --> 00:02:38,880 -In practice, transfer learning is applied on a  -given model by throwing away its head, that is,   - -28 -00:02:38,880 --> 00:02:43,680 -its last layers focused on the pretraining  -objective, and replacing it with a new,   - -29 -00:02:43,680 --> 00:02:50,000 -randomly initialized, head suitable for the task  -at hand. For instance, when we fine-tuned a BERT   - -30 -00:02:50,000 --> 00:02:55,440 -model earlier, we removed the head that classified  -mask words and replaced it with a classifier with   - -31 -00:02:55,440 --> 00:03:01,680 -2 outputs, since our task had two labels. To  -be as efficient as possible, the pretrained   - -32 -00:03:01,680 --> 00:03:07,200 -model used should be as similar as possible  -to the task it’s fine-tuned on. For instance,   - -33 -00:03:07,200 --> 00:03:12,720 -if the problem it’s to classify German sentences,  -it’s best to use a German pretrained model.   - -34 -00:03:14,160 --> 00:03:19,200 -But with the good comes the bad. 
The pretrained  -model does not only transfer its knowledge,   - -35 -00:03:19,200 --> 00:03:25,440 -but also any bias it may contain. ImageNet mostly  -contains images coming from the United States and   - -36 -00:03:25,440 --> 00:03:29,680 -Western Europe, so models fine-tuned with it  -usually will perform better on images from   - -37 -00:03:29,680 --> 00:03:35,280 -these countries. OpenAI also studied the  -bias in the predictions of its GPT-3 model   - -38 -00:03:35,840 --> 00:03:40,960 -(which was pretrained using the guess the next  -work objective). Changing the gender of the prompt   - -39 -00:03:40,960 --> 00:03:46,720 -from "He was very" to "She was very" changed  -the predictions from mostly neutral adjectives   - -40 -00:03:47,360 --> 00:03:52,240 -to almost only physical ones. In  -their model card of the GPT-2 model,   - -41 -00:03:52,240 --> 00:03:59,840 -OpenAI also acknowledges its bias and discourages  -its use in systems that interact with humans. +1 +00:00:00,189 --> 00:00:02,856 +(air whooshing) + +2 +00:00:05,550 --> 00:00:07,293 +- What is transfer learning? + +3 +00:00:09,480 --> 00:00:10,920 +The idea of transfer learning + +4 +00:00:10,920 --> 00:00:12,570 +is to leverage the knowledge acquired + +5 +00:00:12,570 --> 00:00:15,543 +by a model trained with lots +of data on another task. + +6 +00:00:16,410 --> 00:00:20,130 +The model A will be trained +specifically for task A. + +7 +00:00:20,130 --> 00:00:22,200 +Now let's say you want to train a model B + +8 +00:00:22,200 --> 00:00:23,970 +for a different task. + +9 +00:00:23,970 --> 00:00:27,330 +One option would be to train +the model from scratch. + +10 +00:00:27,330 --> 00:00:30,633 +This could take lots of +computation, time and data. + +11 +00:00:31,470 --> 00:00:34,260 +Instead, we could initialize model B + +12 +00:00:34,260 --> 00:00:36,570 +with the same weights as model A, + +13 +00:00:36,570 --> 00:00:39,213 +transferring the knowledge +of model A on task B. + +14 +00:00:41,040 --> 00:00:42,690 +When training from scratch, + +15 +00:00:42,690 --> 00:00:45,870 +all the model's weight +are initialized randomly. + +16 +00:00:45,870 --> 00:00:48,870 +In this example, we are +training a BERT model + +17 +00:00:48,870 --> 00:00:50,220 +on the task of recognizing + +18 +00:00:50,220 --> 00:00:52,203 +if two sentences are similar or not. + +19 +00:00:54,116 --> 00:00:56,730 +On the left, it's trained from scratch, + +20 +00:00:56,730 --> 00:01:00,000 +and on the right it's +fine-tuning a pretrained model. + +21 +00:01:00,000 --> 00:01:02,220 +As we can see, using transfer learning + +22 +00:01:02,220 --> 00:01:05,160 +and the pretrained model +yields better results. + +23 +00:01:05,160 --> 00:01:07,140 +And it doesn't matter if we train longer. + +24 +00:01:07,140 --> 00:01:10,620 +The training from scratch is +capped around 70% accuracy + +25 +00:01:10,620 --> 00:01:13,293 +while the pretrained model +beats the 86% easily. + +26 +00:01:14,460 --> 00:01:16,140 +This is because pretrained models + +27 +00:01:16,140 --> 00:01:18,420 +are usually trained on +large amounts of data + +28 +00:01:18,420 --> 00:01:21,000 +that provide the model with +a statistical understanding + +29 +00:01:21,000 --> 00:01:23,413 +of the language used during pretraining. + +30 +00:01:24,450 --> 00:01:25,950 +In computer vision, + +31 +00:01:25,950 --> 00:01:28,080 +transfer learning has +been applied successfully + +32 +00:01:28,080 --> 00:01:30,060 +for almost ten years. 
+ +33 +00:01:30,060 --> 00:01:32,850 +Models are frequently +pretrained on ImageNet, + +34 +00:01:32,850 --> 00:01:36,153 +a dataset containing 1.2 +millions of photo images. + +35 +00:01:37,170 --> 00:01:41,130 +Each image is classified +by one of 1000 labels. + +36 +00:01:41,130 --> 00:01:44,010 +Training like this, on labeled data + +37 +00:01:44,010 --> 00:01:45,663 +is called supervised learning. + +38 +00:01:47,340 --> 00:01:49,140 +In Natural Language Processing, + +39 +00:01:49,140 --> 00:01:51,870 +transfer learning is a bit more recent. + +40 +00:01:51,870 --> 00:01:54,480 +A key difference with ImageNet +is that the pretraining + +41 +00:01:54,480 --> 00:01:56,460 +is usually self-supervised, + +42 +00:01:56,460 --> 00:01:58,770 +which means it doesn't +require humans annotations + +43 +00:01:58,770 --> 00:01:59,673 +for the labels. + +44 +00:02:00,780 --> 00:02:02,700 +A very common pretraining objective + +45 +00:02:02,700 --> 00:02:05,310 +is to guess the next word in a sentence. + +46 +00:02:05,310 --> 00:02:07,710 +Which only requires lots and lots of text. + +47 +00:02:07,710 --> 00:02:10,710 +GPT-2 for instance, +was pretrained this way + +48 +00:02:10,710 --> 00:02:12,900 +using the content of 45 millions links + +49 +00:02:12,900 --> 00:02:14,673 +posted by users on Reddit. + +50 +00:02:16,560 --> 00:02:19,590 +Another example of self-supervised +pretraining objective + +51 +00:02:19,590 --> 00:02:22,470 +is to predict the value +of randomly masked words. + +52 +00:02:22,470 --> 00:02:24,540 +Which is similar to +fill-in-the-blank tests + +53 +00:02:24,540 --> 00:02:26,760 +you may have done in school. + +54 +00:02:26,760 --> 00:02:29,880 +BERT was pretrained this way +using the English Wikipedia + +55 +00:02:29,880 --> 00:02:31,893 +and 11,000 unpublished books. + +56 +00:02:33,120 --> 00:02:36,450 +In practice, transfer learning +is applied on a given model + +57 +00:02:36,450 --> 00:02:39,090 +by throwing away its head, that is, + +58 +00:02:39,090 --> 00:02:42,150 +its last layers focused on +the pretraining objective, + +59 +00:02:42,150 --> 00:02:45,360 +and replacing it with a new, +randomly initialized head + +60 +00:02:45,360 --> 00:02:46,860 +suitable for the task at hand. + +61 +00:02:47,970 --> 00:02:51,570 +For instance, when we +fine-tuned a BERT model earlier, + +62 +00:02:51,570 --> 00:02:54,060 +we removed the head that +classified mask words + +63 +00:02:54,060 --> 00:02:56,790 +and replaced it with a +classifier with 2 outputs. + +64 +00:02:56,790 --> 00:02:58,563 +Since our task had two labels. + +65 +00:02:59,700 --> 00:03:02,490 +To be as efficient as possible, +the pretrained model used + +66 +00:03:02,490 --> 00:03:03,770 +should be as similar as possible + +67 +00:03:03,770 --> 00:03:06,270 +to the task it's fine-tuned on. + +68 +00:03:06,270 --> 00:03:08,190 +For instance, if the problem + +69 +00:03:08,190 --> 00:03:10,860 +is to classify German sentences, + +70 +00:03:10,860 --> 00:03:13,053 +it's best to use a +German pretrained model. + +71 +00:03:14,370 --> 00:03:16,649 +But with the good comes the bad. + +72 +00:03:16,649 --> 00:03:19,380 +The pretrained model does not +only transfer its knowledge, + +73 +00:03:19,380 --> 00:03:21,693 +but also any bias it may contain. + +74 +00:03:22,530 --> 00:03:24,300 +ImageNet mostly contains images + +75 +00:03:24,300 --> 00:03:26,850 +coming from the United +States and Western Europe. 
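A minimal sketch of the head replacement described in the cues above, assuming the transformers library with a PyTorch backend is installed; the checkpoint name and the two-label setup mirror the BERT sentence-pair example from the video.

    from transformers import AutoModelForSequenceClassification

    # Loads the pretrained BERT body and adds a new, randomly initialized
    # classification head with 2 outputs; the pretraining head is discarded.
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=2
    )
    # The warning about newly initialized weights is expected: that new head
    # is exactly the part that gets trained when fine-tuning on the downstream task.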
+ +76 +00:03:26,850 --> 00:03:28,020 +So models fine-tuned with it + +77 +00:03:28,020 --> 00:03:31,710 +usually will perform better on +images from these countries. + +78 +00:03:31,710 --> 00:03:33,690 +OpenAI also studied the bias + +79 +00:03:33,690 --> 00:03:36,120 +in the predictions of its GPT-3 model + +80 +00:03:36,120 --> 00:03:36,953 +which was pretrained + +81 +00:03:36,953 --> 00:03:38,750 +using the guess the next word objective. + +82 +00:03:39,720 --> 00:03:41,040 +Changing the gender of the prompt + +83 +00:03:41,040 --> 00:03:44,250 +from he was very to she was very + +84 +00:03:44,250 --> 00:03:47,550 +changed the predictions from +mostly neutral adjectives + +85 +00:03:47,550 --> 00:03:49,233 +to almost only physical ones. + +86 +00:03:50,400 --> 00:03:52,367 +In their model card of the GPT-2 model, + +87 +00:03:52,367 --> 00:03:54,990 +OpenAI also acknowledges its bias + +88 +00:03:54,990 --> 00:03:56,730 +and discourages its use + +89 +00:03:56,730 --> 00:03:58,803 +in systems that interact with humans. + +90 +00:04:01,040 --> 00:04:03,707 +(air whooshing) + diff --git a/subtitles/en/04_the-transformer-architecture.srt b/subtitles/en/04_the-transformer-architecture.srt index 19fd99fb0..aecc03a08 100644 --- a/subtitles/en/04_the-transformer-architecture.srt +++ b/subtitles/en/04_the-transformer-architecture.srt @@ -1,138 +1,280 @@ -1 -00:00:04,960 --> 00:00:07,120 -Let's study the transformer architecture.   - -2 -00:00:08,960 --> 00:00:13,840 -This video is the introductory video to  -the encoders, decoders, and encoder-decoder   - -3 -00:00:13,840 --> 00:00:18,640 -series of videos. In this series, we'll try to  -understand what makes a Transformer network,   - -4 -00:00:18,640 --> 00:00:24,720 -and we'll try to explain it in simple, high-level  -terms. No understanding of neural networks is   - -5 -00:00:24,720 --> 00:00:29,840 -necessary, only an understanding of  -basic vectors and tensors may help.   - -6 -00:00:32,320 --> 00:00:36,480 -To get started, we'll take up this diagram  -from the original transformer paper,   - -7 -00:00:36,480 --> 00:00:42,640 -entitled "Attention is all you need". As we'll  -see here we can leverage only some parts of it,   - -8 -00:00:42,640 --> 00:00:48,080 -according to what we're trying to do. We won't  -dive into the specific layers building up that   - -9 -00:00:48,080 --> 00:00:52,560 -architecture, but we'll try to understand the  -different ways this architecture can be used.   - -10 -00:00:54,960 --> 00:00:59,760 -Let's first start by splitting that architecture  -into two parts. On the left we have the encoder,   - -11 -00:00:59,760 --> 00:01:04,320 -and on the right, the decoder. These two can  -be used together, but they can also be used   - -12 -00:01:04,320 --> 00:01:11,280 -independently! Let's understand how these work:  -The encoder accepts inputs that represent text.   - -13 -00:01:11,280 --> 00:01:17,200 -It converts this text, these words, into numerical  -representations. These numerical representations   - -14 -00:01:17,200 --> 00:01:23,120 -can also be called embeddings, or features. We'll  -see that it uses the self-attention mechanism as   - -15 -00:01:23,120 --> 00:01:29,840 -its main component. We recommend you check out the  -video on encoders especially to understand what is   - -16 -00:01:29,840 --> 00:01:36,640 -this numerical representation, as well as how it  -works. We'll study the self-attention mechanism as   - -17 -00:01:36,640 --> 00:01:44,000 -well as its bi-directional properties. 
The decoder  -is similar to the encoder: it can also accept   - -18 -00:01:44,000 --> 00:01:47,200 -the same inputs as the encoder: inputs that  -represent text. It uses a similar mechanism as   - -19 -00:01:47,200 --> 00:01:53,200 -the encoder, which is the masked self-attention  -as well. It differs from the encoder due to its   - -20 -00:01:53,200 --> 00:01:59,200 -uni-directional property, and is traditionally  -used in an auto-regressive manner. Here too,   - -21 -00:01:59,200 --> 00:02:03,600 -we recommend you check out the video on decoders  -especially to understand how all of this works.   - -22 -00:02:06,560 --> 00:02:11,120 -Combining the two parts results in what is known  -as an encoder-decoder, or a sequence-to-sequence   - -23 -00:02:11,120 --> 00:02:16,640 -transformer. The encoder accepts inputs and  -computes a high-level representation of those   - -24 -00:02:16,640 --> 00:02:22,640 -inputs. These outputs are then passed to the  -decoder. The decoder uses the encoder's output   - -25 -00:02:22,640 --> 00:02:27,680 -alongside other inputs, in order to generate  -a prediction. It then predicts an output,   - -26 -00:02:27,680 --> 00:02:32,000 -which it will re-use in future iterations,  -hence the term "auto-regressive".   - -27 -00:02:33,040 --> 00:02:36,480 -Finally, to get an understanding  -of the encoder-decoders as a whole,   - -28 -00:02:36,480 --> 00:02:44,080 -we recommend you check out  -the video on encoder-decoders. +1 +00:00:00,000 --> 00:00:02,750 +(logo whooshing) + +2 +00:00:05,010 --> 00:00:07,323 +- Let's study the +transformer architecture. + +3 +00:00:09,150 --> 00:00:12,030 +This video is the introductory +video to the encoders, + +4 +00:00:12,030 --> 00:00:15,510 +decoders, and encoder-decoder +series of videos. + +5 +00:00:15,510 --> 00:00:16,343 +In this series, + +6 +00:00:16,343 --> 00:00:18,900 +we'll try to understand what +makes a transformer network, + +7 +00:00:18,900 --> 00:00:22,770 +and we'll try to explain it +in simple, high-level terms. + +8 +00:00:22,770 --> 00:00:25,800 +No advanced understanding of +neural networks is necessary, + +9 +00:00:25,800 --> 00:00:29,343 +but an understanding of basic +vectors and tensors may help. + +10 +00:00:32,250 --> 00:00:33,270 +To get started, + +11 +00:00:33,270 --> 00:00:34,530 +we'll take up this diagram + +12 +00:00:34,530 --> 00:00:36,630 +from the original transformer paper, + +13 +00:00:36,630 --> 00:00:40,140 +entitled "Attention Is All +You Need", by Vaswani et al. + +14 +00:00:40,140 --> 00:00:41,010 +As we'll see here, + +15 +00:00:41,010 --> 00:00:42,780 +we can leverage only some parts of it, + +16 +00:00:42,780 --> 00:00:44,630 +according to what we're trying to do. + +17 +00:00:45,480 --> 00:00:47,610 +We want to dive into the specific layers, + +18 +00:00:47,610 --> 00:00:48,990 +building up that architecture, + +19 +00:00:48,990 --> 00:00:51,390 +but we'll try to understand +the different ways + +20 +00:00:51,390 --> 00:00:52,893 +this architecture can be used. + +21 +00:00:55,170 --> 00:00:56,003 +Let's first start + +22 +00:00:56,003 --> 00:00:58,260 +by splitting that +architecture into two parts. + +23 +00:00:58,260 --> 00:00:59,910 +On the left we have the encoder, + +24 +00:00:59,910 --> 00:01:01,980 +and on the right, the decoder. + +25 +00:01:01,980 --> 00:01:03,330 +These two can be used together, + +26 +00:01:03,330 --> 00:01:05,330 +but they can also be used independently. + +27 +00:01:06,180 --> 00:01:08,610 +Let's understand how these work. 
+ +28 +00:01:08,610 --> 00:01:11,460 +The encoder accepts inputs +that represent text. + +29 +00:01:11,460 --> 00:01:13,620 +It converts this text, these words, + +30 +00:01:13,620 --> 00:01:15,675 +into numerical representations. + +31 +00:01:15,675 --> 00:01:17,400 +These numerical representations + +32 +00:01:17,400 --> 00:01:20,460 +can also be called +embeddings, or features. + +33 +00:01:20,460 --> 00:01:23,100 +We'll see that it uses the +self-attention mechanism + +34 +00:01:23,100 --> 00:01:24,483 +as its main component. + +35 +00:01:25,500 --> 00:01:27,120 +We recommend you check out the video + +36 +00:01:27,120 --> 00:01:29,700 +on encoders specifically to understand + +37 +00:01:29,700 --> 00:01:31,680 +what is this numerical representation, + +38 +00:01:31,680 --> 00:01:33,690 +as well as how it works. + +39 +00:01:33,690 --> 00:01:36,660 +We'll study the self-attention +mechanism in more detail, + +40 +00:01:36,660 --> 00:01:38,913 +as well as its bi-directional properties. + +41 +00:01:40,650 --> 00:01:42,780 +The decoder is similar to the encoder. + +42 +00:01:42,780 --> 00:01:45,630 +It can also accept text inputs. + +43 +00:01:45,630 --> 00:01:48,210 +It uses a similar +mechanism as the encoder, + +44 +00:01:48,210 --> 00:01:51,150 +which is the masked +self-attention as well. + +45 +00:01:51,150 --> 00:01:52,590 +It differs from the encoder + +46 +00:01:52,590 --> 00:01:54,990 +due to its uni-directional feature + +47 +00:01:54,990 --> 00:01:58,590 +and is traditionally used in +an auto-regressive manner. + +48 +00:01:58,590 --> 00:02:01,650 +Here too, we recommend you +check out the video on decoders + +49 +00:02:01,650 --> 00:02:04,000 +especially to understand +how all of this works. + +50 +00:02:06,810 --> 00:02:07,890 +Combining the two parts + +51 +00:02:07,890 --> 00:02:10,200 +results in what is known +as an encoder-decoder, + +52 +00:02:10,200 --> 00:02:12,720 +or a sequence-to-sequence transformer. + +53 +00:02:12,720 --> 00:02:14,280 +The encoder accepts inputs + +54 +00:02:14,280 --> 00:02:17,850 +and computes a high-level +representation of those inputs. + +55 +00:02:17,850 --> 00:02:20,252 +These outputs are then +passed to the decoder. + +56 +00:02:20,252 --> 00:02:22,860 +The decoder uses the encoder's output, + +57 +00:02:22,860 --> 00:02:26,370 +alongside other inputs +to generate a prediction. + +58 +00:02:26,370 --> 00:02:27,900 +It then predicts an output, + +59 +00:02:27,900 --> 00:02:30,248 +which it will re-use in future iterations, + +60 +00:02:30,248 --> 00:02:32,662 +hence the term, auto-regressive. + +61 +00:02:32,662 --> 00:02:34,740 +Finally, to get an understanding + +62 +00:02:34,740 --> 00:02:36,690 +of the encoder-decoders as a whole, + +63 +00:02:36,690 --> 00:02:39,670 +we recommend you check out +the video on encoder-decoders. + +64 +00:02:39,670 --> 00:02:42,420 +(logo whooshing) + diff --git a/subtitles/en/05_transformer-models-encoders.srt b/subtitles/en/05_transformer-models-encoders.srt index b438afdbe..1171958b2 100644 --- a/subtitles/en/05_transformer-models-encoders.srt +++ b/subtitles/en/05_transformer-models-encoders.srt @@ -1,224 +1,454 @@ -1 -00:00:04,320 --> 00:00:09,120 -In this video, we'll study the encoder  -architecture. An example of a popular   - -2 -00:00:09,120 --> 00:00:13,120 -encoder-only architecture is BERT, which  -is the most popular model of its kind.   - -3 -00:00:14,400 --> 00:00:20,880 -Let's first start by understanding how it works.  -We'll use a small example, using three words. 
We   - -4 -00:00:20,880 --> 00:00:27,040 -use these as inputs, and pass them through the  -encoder. We retrieve a numerical representation   - -5 -00:00:27,040 --> 00:00:34,160 -of each word. Here, for example, the encoder  -converts the three words “Welcome to NYC”   - -6 -00:00:34,160 --> 00:00:40,880 -in these three sequences of numbers. The encoder  -outputs exactly one sequence of numbers per input   - -7 -00:00:40,880 --> 00:00:46,880 -word. This numerical representation can also be  -called a "Feature vector", or "Feature tensor".  - -8 -00:00:48,880 --> 00:00:53,680 -Let's dive in this representation. It contains  -one vector per word that was passed through the   - -9 -00:00:53,680 --> 00:00:59,680 -encoder. Each of these vector is a numerical  -representation of the word in question.   - -10 -00:01:00,880 --> 00:01:06,400 -The dimension of that vector is defined by the  -architecture of the model, for the base BERT   - -11 -00:01:06,400 --> 00:01:15,280 -model, it is 768. These representations contain  -the value of a word; but contextualized. For   - -12 -00:01:15,280 --> 00:01:21,280 -example, the vector attributed to the word "to",  -isn't the representation of only the "to" word.   - -13 -00:01:22,160 --> 00:01:29,680 -It also takes into account the words around it,  -which we call the “context”.As in, it looks to the   - -14 -00:01:29,680 --> 00:01:34,960 -left context, the word on the left of the one  -we're studying (here the word "Welcome") and   - -15 -00:01:34,960 --> 00:01:41,120 -the context on the right (here the word "NYC") and  -outputs a value for the word, within its context.   - -16 -00:01:41,840 --> 00:01:49,280 -It is therefore a contextualized value. One  -could say that the vector of 768 values holds the   - -17 -00:01:49,280 --> 00:01:55,840 -"meaning" of that word in the text. How it does  -this is thanks to the self-attention mechanism.   - -18 -00:01:57,120 --> 00:02:02,240 -The self-attention mechanism relates to different  -positions (or different words) in a single   - -19 -00:02:02,240 --> 00:02:08,320 -sequence, in order to compute a representation  -of that sequence. As we've seen before, this   - -20 -00:02:08,320 --> 00:02:13,600 -means that the resulting representation of a word  -has been affected by other words in the sequence.   - -21 -00:02:15,600 --> 00:02:20,160 -We won't dive into the specifics here, but we'll  -offer some further readings if you want to get   - -22 -00:02:20,160 --> 00:02:26,480 -a better understanding at what happens under  -the hood. So when should one use an encoder?   - -23 -00:02:27,040 --> 00:02:33,680 -Encoders can be used as standalone models in a  -wide variety of tasks. For example BERT, arguably   - -24 -00:02:33,680 --> 00:02:38,800 -the most famous transformer model, is a standalone  -encoder model and at the time of release,   - -25 -00:02:38,800 --> 00:02:44,000 -beat the state of the art in many sequence  -classification tasks, question answering tasks,   - -26 -00:02:44,000 --> 00:02:50,240 -and masked language modeling, to only cite a  -few. The idea is that encoders are very powerful   - -27 -00:02:50,240 --> 00:02:55,920 -at extracting vectors that carry meaningful  -information about a sequence. This vector can   - -28 -00:02:55,920 --> 00:02:59,680 -then be handled down the road by additional  -layers of neurons to make sense of them.   - -29 -00:03:01,200 --> 00:03:04,240 -Let's take a look at some examples  -where encoders really shine.   
-

-30
-00:03:06,080 --> 00:03:11,760
-First of all, Masked Language Modeling, or
-MLM. It's the task of predicting a hidden word

-31
-00:03:11,760 --> 00:03:18,560
-in a sequence of words. Here, for example, we have
-hidden the word between "My" and "is". This is one

-32
-00:03:18,560 --> 00:03:24,000
-of the objectives with which BERT was trained: it
-was trained to predict hidden words in a sequence.

-33
-00:03:25,040 --> 00:03:30,160
-Encoders shine in this scenario in particular,
-as bidirectional information is crucial here.

-34
-00:03:30,960 --> 00:03:35,520
-If we didn't have the words on the right (is,
-Sylvain, and the dot), then there is very little

-35
-00:03:35,520 --> 00:03:41,200
-chance that BERT would have been able to identify
-"name" as the correct word. The encoder needs to

-36
-00:03:41,200 --> 00:03:46,720
-have a good understanding of the sequence in order
-to predict a masked word, as even if the text is

-37
-00:03:46,720 --> 00:03:52,080
-grammatically correct, It does not necessarily
-make sense in the context of the sequence.

-38
-00:03:54,960 --> 00:03:58,720
-As mentioned earlier, encoders are
-good at doing sequence classification.

-39
-00:03:59,360 --> 00:04:03,560
-Sentiment analysis is an example
-of a sequence classification task.

-40
-00:04:04,240 --> 00:04:11,040
-The model's aim is to identify the sentiment of
-a sequence – it can range from giving a sequence

-41
-00:04:11,040 --> 00:04:16,720
-a rating from one to five stars if doing review
-analysis, to giving a positive or negative rating

-42
-00:04:16,720 --> 00:04:22,800
-to a sequence, which is what is shown here.
-For example here, given the two sequences,

-43
-00:04:22,800 --> 00:04:28,800
-we use the model to compute a prediction and to
-classify the sequences among these two classes:

-44
-00:04:28,800 --> 00:04:35,040
-positive and negative. While the two sequences
-are very similar, containing the same words,

-45
-00:04:35,040 --> 00:04:41,840
-the meaning is different – and the encoder
-model is able to grasp that difference.
+1
+00:00:00,253 --> 00:00:03,003
+(intro striking)
+
+2
+00:00:04,440 --> 00:00:07,830
+- In this video, we'll study
+the encoder architecture.
+
+3
+00:00:07,830 --> 00:00:11,070
+An example of a popular encoder
+only architecture is BERT
+
+4
+00:00:11,070 --> 00:00:13,323
+which is the most popular
+model of its kind.
+
+5
+00:00:14,550 --> 00:00:16,950
+Let's first start by
+understanding how it works.
+
+6
+00:00:18,360 --> 00:00:20,910
+We'll use a small example
+using three words.
+
+7
+00:00:20,910 --> 00:00:23,823
+We use these as inputs and
+pass them through the encoder.
+
+8
+00:00:25,290 --> 00:00:28,173
+We retrieve a numerical
+representation of each word.
+
+9
+00:00:29,970 --> 00:00:32,700
+Here, for example, the encoder
+converts those three words,
+
+10
+00:00:32,700 --> 00:00:37,350
+Welcome to NYC, in these
+three sequences of numbers.
+
+11
+00:00:37,350 --> 00:00:40,350
+The encoder outputs exactly
+one sequence of numbers
+
+12
+00:00:40,350 --> 00:00:41,493
+per input word.
+
+13
+00:00:42,330 --> 00:00:44,880
+This numerical representation
+can also be called
+
+14
+00:00:44,880 --> 00:00:47,163
+a feature vector, or a feature tensor.
+
+15
+00:00:49,080 --> 00:00:51,030
+Let's dive into this representation.
+
+16
+00:00:51,030 --> 00:00:52,740
+It contains one vector per word
+
+17
+00:00:52,740 --> 00:00:54,540
+that was passed through the encoder.
+ +18 +00:00:56,130 --> 00:00:58,620 +Each of these vector is a +numerical representation + +19 +00:00:58,620 --> 00:01:00,033 +of the word in question. + +20 +00:01:01,080 --> 00:01:03,300 +The dimension of that vector is defined + +21 +00:01:03,300 --> 00:01:05,520 +by the architecture of the model. + +22 +00:01:05,520 --> 00:01:08,703 +For the base BERT model, it is 768. + +23 +00:01:10,650 --> 00:01:13,230 +These representations +contain the value of a word, + +24 +00:01:13,230 --> 00:01:15,240 +but contextualized. + +25 +00:01:15,240 --> 00:01:18,570 +For example, the vector +attributed to the word "to" + +26 +00:01:18,570 --> 00:01:22,290 +isn't the representation +of only the "to" word. + +27 +00:01:22,290 --> 00:01:25,650 +It also takes into account +the words around it + +28 +00:01:25,650 --> 00:01:27,363 +which we call the context. + +29 +00:01:28,650 --> 00:01:30,780 +As in it looks to the left context, + +30 +00:01:30,780 --> 00:01:32,970 +the words on the left of +the one we're studying, + +31 +00:01:32,970 --> 00:01:34,980 +here the word "Welcome", + +32 +00:01:34,980 --> 00:01:37,497 +and the context on the +right, here the word "NYC", + +33 +00:01:38,348 --> 00:01:42,000 +and it outputs a value for +the word given its context. + +34 +00:01:42,000 --> 00:01:45,420 +It is therefore a contextualized value. + +35 +00:01:45,420 --> 00:01:48,810 +One could say that the +vector of 768 values + +36 +00:01:48,810 --> 00:01:51,993 +holds the meaning of the +word within the text. + +37 +00:01:53,310 --> 00:01:56,073 +It does this thanks to the +self-attention mechanism. + +38 +00:01:57,240 --> 00:02:00,630 +The self-attention mechanism +relates to different positions, + +39 +00:02:00,630 --> 00:02:02,850 +or different words in a single sequence + +40 +00:02:02,850 --> 00:02:06,003 +in order to compute a +representation of that sequence. + +41 +00:02:07,200 --> 00:02:09,000 +As we've seen before, this means that + +42 +00:02:09,000 --> 00:02:11,130 +the resulting representation of a word + +43 +00:02:11,130 --> 00:02:13,983 +has been affected by other +words in the sequence. + +44 +00:02:15,840 --> 00:02:18,030 +We won't dive into the specifics here + +45 +00:02:18,030 --> 00:02:19,680 +which will offer some further readings + +46 +00:02:19,680 --> 00:02:21,330 +if you want to get a better understanding + +47 +00:02:21,330 --> 00:02:22,953 +at what happens under the hood. + +48 +00:02:25,050 --> 00:02:27,480 +So why should one use and encoder? + +49 +00:02:27,480 --> 00:02:29,370 +Encoders can be used as stand-alone models + +50 +00:02:29,370 --> 00:02:31,263 +in a wide variety of tasks. + +51 +00:02:32,100 --> 00:02:33,360 +For example, BERT, + +52 +00:02:33,360 --> 00:02:35,670 +arguably the most famous +transformer model, + +53 +00:02:35,670 --> 00:02:37,590 +is a standalone encoder model, + +54 +00:02:37,590 --> 00:02:38,820 +and at the time of release, + +55 +00:02:38,820 --> 00:02:40,440 +it'd be the state of the art + +56 +00:02:40,440 --> 00:02:42,780 +in many sequence classification tasks, + +57 +00:02:42,780 --> 00:02:44,190 +question answering tasks, + +58 +00:02:44,190 --> 00:02:46,743 +and mask language modeling +to only cite of few. + +59 +00:02:48,150 --> 00:02:50,460 +The idea is that encoders +are very powerful + +60 +00:02:50,460 --> 00:02:52,470 +at extracting vectors that carry + +61 +00:02:52,470 --> 00:02:55,350 +meaningful information about a sequence. 
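
A minimal sketch of the feature extraction described above, assuming the bert-base-uncased checkpoint and PyTorch (the exact sequence length depends on how the sentence is tokenized):

import torch
from transformers import AutoTokenizer, AutoModel

# Assumption: bert-base-uncased as the encoder-only checkpoint
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Welcome to NYC", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One contextualized vector per token; the hidden size is 768 for base BERT
print(outputs.last_hidden_state.shape)  # e.g. torch.Size([1, 5, 768])
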
+ +62 +00:02:55,350 --> 00:02:57,870 +This vector can then be +handled down the road + +63 +00:02:57,870 --> 00:03:00,070 +by additional neurons +to make sense of them. + +64 +00:03:01,380 --> 00:03:02,850 +Let's take a look at some examples + +65 +00:03:02,850 --> 00:03:04,563 +where encoder really shine. + +66 +00:03:06,210 --> 00:03:09,900 +First of all, Masked +Language Modeling, or MLM. + +67 +00:03:09,900 --> 00:03:11,970 +It's the task of predicting a hidden word + +68 +00:03:11,970 --> 00:03:13,590 +in a sequence of word. + +69 +00:03:13,590 --> 00:03:15,630 +Here, for example, we have hidden the word + +70 +00:03:15,630 --> 00:03:17,247 +between "My" and "is". + +71 +00:03:18,270 --> 00:03:21,120 +This is one of the objectives +with which BERT was trained. + +72 +00:03:21,120 --> 00:03:24,393 +It was trained to predict +hidden words in a sequence. + +73 +00:03:25,230 --> 00:03:27,930 +Encoders shine in this +scenario in particular + +74 +00:03:27,930 --> 00:03:31,140 +as bi-directional +information is crucial here. + +75 +00:03:31,140 --> 00:03:32,947 +If we didn't have the words on the right, + +76 +00:03:32,947 --> 00:03:34,650 +"is", "Sylvain" and the ".", + +77 +00:03:34,650 --> 00:03:35,940 +then there is very little chance + +78 +00:03:35,940 --> 00:03:38,580 +that BERT would have been +able to identify name + +79 +00:03:38,580 --> 00:03:40,500 +as the correct word. + +80 +00:03:40,500 --> 00:03:42,270 +The encoder needs to +have a good understanding + +81 +00:03:42,270 --> 00:03:45,360 +of the sequence in order +to predict a masked word + +82 +00:03:45,360 --> 00:03:48,840 +as even if the text is +grammatically correct, + +83 +00:03:48,840 --> 00:03:50,610 +it does not necessarily make sense + +84 +00:03:50,610 --> 00:03:52,413 +in the context of the sequence. + +85 +00:03:55,230 --> 00:03:56,580 +As mentioned earlier, + +86 +00:03:56,580 --> 00:03:59,520 +encoders are good at doing +sequence classification. + +87 +00:03:59,520 --> 00:04:02,883 +Sentiment analysis is an example +of sequence classification. + +88 +00:04:04,410 --> 00:04:09,410 +The model's aim is to identify +the sentiment of a sequence. + +89 +00:04:09,540 --> 00:04:11,280 +It can range from giving a sequence, + +90 +00:04:11,280 --> 00:04:12,960 +a rating from one to five stars + +91 +00:04:12,960 --> 00:04:15,900 +if doing review analysis +to giving a positive, + +92 +00:04:15,900 --> 00:04:17,820 +or negative rating to a sequence + +93 +00:04:17,820 --> 00:04:19,220 +which is what is shown here. + +94 +00:04:20,280 --> 00:04:22,950 +For example, here, +given the two sequences, + +95 +00:04:22,950 --> 00:04:25,860 +we use the model to compute a prediction, + +96 +00:04:25,860 --> 00:04:27,420 +and to classify the sequences + +97 +00:04:27,420 --> 00:04:30,393 +among these two classes, +positive and negative. + +98 +00:04:31,230 --> 00:04:33,450 +While the two sequences are very similar + +99 +00:04:33,450 --> 00:04:35,220 +containing the same words, + +100 +00:04:35,220 --> 00:04:37,170 +the meaning is entirely different, + +101 +00:04:37,170 --> 00:04:40,143 +and the encoder model is able +to grasp that difference. 
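
Both tasks can be tried end to end with the pipeline function. A short sketch, assuming bert-base-uncased for the masked word example and the default sentiment-analysis checkpoint, with illustrative sentences:

from transformers import pipeline

# Masked language modeling: predict the hidden word (checkpoint is an assumption)
fill_mask = pipeline("fill-mask", model="bert-base-uncased")
print(fill_mask("My [MASK] is Sylvain."))

# Sequence classification: two similar sentences with opposite sentiment
classifier = pipeline("sentiment-analysis")
print(classifier(["I love this movie.", "I do not love this movie."]))
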
+ +102 +00:04:41,404 --> 00:04:44,154 +(outro striking) + diff --git a/subtitles/en/06_transformer-models-decoders.srt b/subtitles/en/06_transformer-models-decoders.srt index af7fa0170..0b0c84cde 100644 --- a/subtitles/en/06_transformer-models-decoders.srt +++ b/subtitles/en/06_transformer-models-decoders.srt @@ -1,263 +1,395 @@ -1 -00:00:03,860 --> 00:00:09,750 -In this video, we'll study the decoder architecture. -An example of a popular decoder-only architecture - -2 -00:00:09,750 --> 00:00:15,809 -is GPT-2. In order to understand how decoders -work, we recommend taking a look at the video - -3 -00:00:15,809 --> 00:00:21,640 -regarding encoders: they're extremely similar -to decoders. One can use a decoder for most - -4 -00:00:21,640 --> 00:00:26,429 -of the same tasks as an encoder, albeit with, -generally, a little loss of performance. Let's - -5 -00:00:26,429 --> 00:00:31,769 -take the same approach we have taken with -the encoder to try and understand the architectural - -6 -00:00:31,769 --> 00:00:38,969 -differences between an encoder and a decoder. -We'll use a small example, using three words. - -7 -00:00:38,969 --> 00:00:46,550 -We pass them through the decoder. We retrieve -a numerical representation of each word. Here, - -8 -00:00:46,550 --> 00:00:51,739 -for example, the decoder converts the three -words “Welcome to NYC” in these three - -9 -00:00:51,739 --> 00:00:57,750 -sequences of numbers. The decoder outputs -exactly one sequence of numbers per input - -10 -00:00:57,750 --> 00:01:03,290 -word. This numerical representation can also -be called a "Feature vector", or "Feature - -11 -00:01:03,290 --> 00:01:09,590 -tensor". Let's dive in this representation. -It contains one vector per word that was passed - -12 -00:01:09,590 --> 00:01:14,830 -through the decoder. Each of these vector -is a numerical representation of the word - -13 -00:01:14,830 --> 00:01:21,810 -in question. The dimension of that vector -is defined by the architecture of the model. - -14 -00:01:21,810 --> 00:01:28,400 -Where the decoder differs from the encoder -is principally with its self-attention mechanism. - -15 -00:01:28,400 --> 00:01:34,090 -It's using what is called "masked self-attention". -Here for example, if we focus on the word - -16 -00:01:34,090 --> 00:01:40,170 -"to", we'll see that its vector is absolutely -unmodified by the "NYC" word. That's because - -17 -00:01:40,170 --> 00:01:45,560 -all the words on the right (also known as -the right context) of the word is masked. - -18 -00:01:45,560 --> 00:01:50,729 -Rather than benefitting from all the words -on the left and right, I.e., the bidirectional - -19 -00:01:50,729 --> 00:02:01,229 -context, decoders only have access to the -words on their left. The masked self-attention - -20 -00:02:01,229 --> 00:02:06,310 -mechanism differs from the self-attention -mechanism by using an additional mask to hide - -21 -00:02:06,310 --> 00:02:12,110 -the context on either side of the word: the -word's numerical representation will not be - -22 -00:02:12,110 --> 00:02:18,730 -affected by the words in the hidden context. -So when should one use a decoder? Decoders, - -23 -00:02:18,730 --> 00:02:24,610 -like encoders, can be used as standalone models. -As they generate a numerical representation, - -24 -00:02:24,610 --> 00:02:30,410 -they can also be used in a wide variety of -tasks. However, the strength of a decoder - -25 -00:02:30,410 --> 00:02:35,420 -lies in the way a word has access to its left -context. 
The decoders, having only access - -26 -00:02:35,420 --> 00:02:40,280 -to their left context, are inherently good -at text generation: the ability to generate - -27 -00:02:40,280 --> 00:02:46,120 -a word, or a sequence of words, given a known -sequence of words. In NLP, this is known as - -28 -00:02:46,120 --> 00:02:52,150 -Causal Language Modeling. Let's look at an -example. Here's an example of how causal language - -29 -00:02:52,150 --> 00:02:59,240 -modeling works: we start with an initial word, -which is "My". We use this as input for the - -30 -00:02:59,240 --> 00:03:06,330 -decoder. The model outputs a vectors of dimension -768. This vector contains information about - -31 -00:03:06,330 --> 00:03:11,650 -the sequence, which is here a single word, -or word. We apply a small transformation to - -32 -00:03:11,650 --> 00:03:17,019 -that vector so that it maps to all the words -known by the model (mapping which we'll see - -33 -00:03:17,019 --> 00:03:22,650 -later, called a language modeling head). We -identify that the model believes the most - -34 -00:03:22,650 --> 00:03:29,720 -probable following word is "name". We then -take that new word, and add it to the initial - -35 -00:03:29,720 --> 00:03:35,560 -sequence. From "My", we are now at "My name". -This is where the "autoregressive" aspect - -36 -00:03:35,560 --> 00:03:42,689 -comes in. Auto-regressive models re-use their -past outputs as inputs in the following steps. - -37 -00:03:42,689 --> 00:03:49,280 -Once again, we do that the exact same operation: -we cast that sequence through the decoder, - -38 -00:03:49,280 --> 00:03:57,459 -and retrieve the most probable following word. -In this case, it is the word "is". We repeat - -39 -00:03:57,459 --> 00:04:03,049 -the operation until we're satisfied. Starting -from a single word, we've now generated a - -40 -00:04:03,049 --> 00:04:08,870 -full sentence. We decide to stop there, but -we could continue for a while; GPT-2, for - -41 -00:04:08,870 --> 00:04:16,919 -example, has a maximum context size of 1024. -We could eventually generate up to 1024 words, - -42 -00:04:16,919 --> 00:04:20,125 -and the decoder would still have some memory -of the first words of the sequence! If we - -43 -00:04:20,125 --> 00:04:21,125 -go back several levels higher, back to the -full transformer model, we can see what we - -44 -00:04:21,125 --> 00:04:22,125 -learned about the decoder part of the full -transformer model. It is what we call, auto-regressive: - -45 -00:04:22,125 --> 00:04:23,125 -it outputs values that are then used as its -input values. We repeat this operations as - -46 -00:04:23,125 --> 00:04:24,125 -we like. It is based off of the masked self-attention -layer, which allows to have word embeddings - -47 -00:04:24,125 --> 00:04:25,125 -which have access to the context on the left -side of the word. If you look at the diagram - -48 -00:04:25,125 --> 00:04:26,125 -however, you'll see that we haven't seen one -of the aspects of the decoder. That is: cross-attention. - -49 -00:04:26,125 --> 00:04:27,125 -There is a second aspect we haven't seen, -which is it's ability to convert features - -50 -00:04:27,125 --> 00:04:28,125 -to words; heavily linked to the cross attention -mechanism. However, these only apply in the - -51 -00:04:28,125 --> 00:04:29,125 -"encoder-decoder" transformer, or the "sequence-to-sequence" -transformer (which can generally be used interchangeably). 
-

-52
-00:04:29,125 --> 00:04:30,125
-We recommend you check out the video on encoder-decoders
-to get an idea of how the decoder can be used

-53
-00:04:30,125 --> 00:04:30,132
-as a component of a larger architecture!
+1
+00:00:03,750 --> 00:00:07,140
+- In this video, we'll study
+the decoder architecture.
+
+2
+00:00:07,140 --> 00:00:07,973
+An example
+
+3
+00:00:07,973 --> 00:00:11,338
+of a popular decoder only
+architecture is GPT-2.
+
+4
+00:00:11,338 --> 00:00:14,160
+In order to understand how decoders work
+
+5
+00:00:14,160 --> 00:00:17,430
+we recommend taking a look at
+the video regarding encoders.
+
+6
+00:00:17,430 --> 00:00:19,980
+They're extremely similar to decoders.
+
+7
+00:00:19,980 --> 00:00:21,210
+One can use a decoder
+
+8
+00:00:21,210 --> 00:00:23,760
+for most of the same tasks as an encoder
+
+9
+00:00:23,760 --> 00:00:27,330
+albeit with generally a
+little loss of performance.
+
+10
+00:00:27,330 --> 00:00:28,890
+Let's take the same approach we have taken
+
+11
+00:00:28,890 --> 00:00:30,300
+with the encoder to try
+
+12
+00:00:30,300 --> 00:00:32,670
+and understand the
+architectural differences
+
+13
+00:00:32,670 --> 00:00:34,803
+between an encoder and decoder.
+
+14
+00:00:35,777 --> 00:00:38,910
+We'll use a small example
+using three words.
+
+15
+00:00:38,910 --> 00:00:41,050
+We pass them through the decoder.
+
+16
+00:00:41,050 --> 00:00:44,793
+We retrieve a numerical
+representation for each word.
+
+17
+00:00:46,410 --> 00:00:49,350
+Here for example, the decoder
+converts the three words,
+
+18
+00:00:49,350 --> 00:00:53,545
+Welcome to NYC, in these
+three sequences of numbers.
+
+19
+00:00:53,545 --> 00:00:56,040
+The decoder outputs exactly one sequence
+
+20
+00:00:56,040 --> 00:00:58,740
+of numbers per input word.
+
+21
+00:00:58,740 --> 00:01:00,630
+This numerical representation can also
+
+22
+00:01:00,630 --> 00:01:03,783
+be called a feature vector
+or a feature tensor.
+
+23
+00:01:04,920 --> 00:01:07,200
+Let's dive in this representation.
+
+24
+00:01:07,200 --> 00:01:08,490
+It contains one vector
+
+25
+00:01:08,490 --> 00:01:11,340
+per word that was passed
+through the decoder.
+
+26
+00:01:11,340 --> 00:01:14,250
+Each of these vectors is
+a numerical representation
+
+27
+00:01:14,250 --> 00:01:15,573
+of the word in question.
+
+28
+00:01:16,920 --> 00:01:18,562
+The dimension of that vector is defined
+
+29
+00:01:18,562 --> 00:01:20,703
+by the architecture of the model.
+
+30
+00:01:22,860 --> 00:01:26,040
+Where the decoder differs from
+the encoder is principally
+
+31
+00:01:26,040 --> 00:01:28,200
+with its self-attention mechanism.
+
+32
+00:01:28,200 --> 00:01:30,843
+It's using what is called
+masked self-attention.
+
+33
+00:01:31,860 --> 00:01:34,650
+Here, for example, if we
+focus on the word "to"
+
+34
+00:01:34,650 --> 00:01:37,620
+we'll see that its vector
+is absolutely unmodified
+
+35
+00:01:37,620 --> 00:01:39,690
+by the NYC word.
+
+36
+00:01:39,690 --> 00:01:41,731
+That's because all the words
+on the right, also known
+
+37
+00:01:41,731 --> 00:01:45,276
+as the right context of
+the word is masked rather
+
+38
+00:01:45,276 --> 00:01:49,230
+than benefiting from all the
+words on the left and right.
+
+39
+00:01:49,230 --> 00:01:51,600
+So the bidirectional context.
+
+40
+00:01:51,600 --> 00:01:55,020
+Decoders only have access
+to a single context
+
+41
+00:01:55,020 --> 00:01:58,203
+which can be the left
+context or the right context.
+ +42 +00:01:59,539 --> 00:02:03,356 +The masked self attention +mechanism differs + +43 +00:02:03,356 --> 00:02:04,320 +from the self attention mechanism + +44 +00:02:04,320 --> 00:02:07,110 +by using an additional +mask to hide the context + +45 +00:02:07,110 --> 00:02:09,390 +on either side of the word + +46 +00:02:09,390 --> 00:02:12,810 +the words numerical representation +will not be affected + +47 +00:02:12,810 --> 00:02:14,853 +by the words in the hidden context. + +48 +00:02:16,260 --> 00:02:18,330 +So when should one use a decoder? + +49 +00:02:18,330 --> 00:02:22,380 +Decoders like encoders can +be used as standalone models + +50 +00:02:22,380 --> 00:02:25,020 +as they generate a +numerical representation. + +51 +00:02:25,020 --> 00:02:28,320 +They can also be used in +a wide variety of tasks. + +52 +00:02:28,320 --> 00:02:31,260 +However, the strength of +a decoder lies in the way. + +53 +00:02:31,260 --> 00:02:34,530 +A word can only have +access to its left context + +54 +00:02:34,530 --> 00:02:36,690 +having only access to their left context. + +55 +00:02:36,690 --> 00:02:39,120 +They're inherently good at text generation + +56 +00:02:39,120 --> 00:02:41,010 +the ability to generate a word + +57 +00:02:41,010 --> 00:02:45,000 +or a sequence of words given +a known sequence of words. + +58 +00:02:45,000 --> 00:02:45,833 +This is known + +59 +00:02:45,833 --> 00:02:49,083 +as causal language modeling or +natural language generation. + +60 +00:02:50,430 --> 00:02:53,520 +Here's an example of how +causal language modeling works. + +61 +00:02:53,520 --> 00:02:56,410 +We start with an initial word, which is my + +62 +00:02:57,339 --> 00:02:59,973 +we use this as input for the decoder. + +63 +00:03:00,810 --> 00:03:04,260 +The model outputs a vector of numbers + +64 +00:03:04,260 --> 00:03:07,230 +and this vector contains +information about the sequence + +65 +00:03:07,230 --> 00:03:08,733 +which is here a single word. + +66 +00:03:09,780 --> 00:03:11,430 +We apply a small transformation + +67 +00:03:11,430 --> 00:03:13,110 +to that vector so that it maps + +68 +00:03:13,110 --> 00:03:16,500 +to all the words known by +the model, which is a mapping + +69 +00:03:16,500 --> 00:03:19,890 +that we'll see later called +a language modeling head. + +70 +00:03:19,890 --> 00:03:21,930 +We identify that the model believes + +71 +00:03:21,930 --> 00:03:25,053 +that the most probable +following word is name. + +72 +00:03:26,250 --> 00:03:28,710 +We then take that new word and add it + +73 +00:03:28,710 --> 00:03:33,480 +to the initial sequence from +my, we are now at my name. + +74 +00:03:33,480 --> 00:03:36,870 +This is where the auto +regressive aspect comes in. + +75 +00:03:36,870 --> 00:03:38,490 +Auto regressive models. + +76 +00:03:38,490 --> 00:03:42,513 +We use their past outputs as +inputs and the following steps. + +77 +00:03:43,452 --> 00:03:46,980 +Once again, we do the +exact same operation. + +78 +00:03:46,980 --> 00:03:49,500 +We cast that sequence through the decoder + +79 +00:03:49,500 --> 00:03:51,993 +and retrieve the most +probable following word. + +80 +00:03:52,978 --> 00:03:57,978 +In this case, it is the word +"is", we repeat the operation + +81 +00:03:58,230 --> 00:04:02,040 +until we're satisfied, +starting from a single word. + +82 +00:04:02,040 --> 00:04:04,590 +We've now generated a full sentence. + +83 +00:04:04,590 --> 00:04:07,890 +We decide to stop there, but +we could continue for a while. 
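
The auto-regressive loop described above can be sketched in a few lines, assuming the gpt2 checkpoint and greedy decoding (always keeping the most probable next token):

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Assumption: gpt2 as the decoder-only model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

input_ids = tokenizer("My", return_tensors="pt").input_ids

for _ in range(5):
    with torch.no_grad():
        logits = model(input_ids).logits       # language modeling head: hidden states -> vocabulary scores
    next_id = logits[:, -1].argmax(dim=-1)     # most probable next token
    input_ids = torch.cat([input_ids, next_id[:, None]], dim=-1)  # feed the output back in as input

print(tokenizer.decode(input_ids[0]))
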
+ +84 +00:04:07,890 --> 00:04:12,890 +GPT two, for example, has a +maximum context size of 1,024. + +85 +00:04:13,170 --> 00:04:16,830 +We could eventually +generate up to a 1,024 words + +86 +00:04:16,830 --> 00:04:19,050 +and the decoder would +still have some memory + +87 +00:04:19,050 --> 00:04:21,003 +of the first words in this sequence. + diff --git a/subtitles/en/07_transformer-models-encoder-decoders.srt b/subtitles/en/07_transformer-models-encoder-decoders.srt index 6abdf581e..e1b47aa21 100644 --- a/subtitles/en/07_transformer-models-encoder-decoders.srt +++ b/subtitles/en/07_transformer-models-encoder-decoders.srt @@ -1,323 +1,621 @@ -1 -00:00:04,160 --> 00:00:07,200 -In this video, we'll study the  -encoder-decoder architecture.   - -2 -00:00:08,160 --> 00:00:16,160 -An example of a popular encoder-decoder model is  -T5. In order to understand how the encoder-decoder   - -3 -00:00:16,160 --> 00:00:21,680 -works, we recommend you check out the videos  -on encoders and decoders as standalone models.   - -4 -00:00:22,400 --> 00:00:30,320 -Understanding how they behave individually will  -help understanding how an encoder-decoder behaves.   - -5 -00:00:30,320 --> 00:00:35,360 -Let's start from what we've seen about the  -encoder. The encoder takes words as inputs,   - -6 -00:00:36,000 --> 00:00:40,640 -casts them through the encoder, and  -retrieves a numerical representation   - -7 -00:00:40,640 --> 00:00:47,360 -for each word cast through it. We now know that  -the numerical representation holds information   - -8 -00:00:47,360 --> 00:00:54,000 -about the meaning of the sequence. Let's put  -this aside and add the decoder to the diagram.   - -9 -00:00:56,480 --> 00:01:00,160 -In this scenario, we're using the decoder  -in a manner that we haven't seen before.   - -10 -00:01:00,720 --> 00:01:07,600 -We're passing the outputs of the encoder directly  -to it! Additionally to the encoder outputs,   - -11 -00:01:07,600 --> 00:01:13,040 -we also give the decoder a sequence. When  -prompting the decoder for an output with no   - -12 -00:01:13,040 --> 00:01:17,360 -initial sequence, we can give it the value  -that indicates the start of a sequence.   - -13 -00:01:18,000 --> 00:01:23,520 -And that's where the encoder-decoder magic  -happens. The encoder accepts a sequence as input.   - -14 -00:01:24,560 --> 00:01:30,480 -It computes a prediction, and outputs a  -numerical representation. Then, it sends   - -15 -00:01:30,480 --> 00:01:38,000 -that over to the decoder. It has, in a sense,  -encoded the sequence. And the decoder, in turn,   - -16 -00:01:38,000 --> 00:01:42,960 -using this input alongside its usual sequence  -input, will take a stab at decoding the sequence.   - -17 -00:01:44,720 --> 00:01:50,400 -The decoder decodes the sequence, and outputs a  -word. As of now, we don't need to make sense of   - -18 -00:01:50,400 --> 00:01:55,440 -that word, but we can understand that the decoder  -is essentially decoding what the encoder has   - -19 -00:01:55,440 --> 00:02:02,160 -output. The "start of sequence word" indicates  -that it should start decoding the sequence.   - -20 -00:02:03,600 --> 00:02:10,240 -Now that we have both the feature vector and  -an initial generated word, we don't need the   - -21 -00:02:10,240 --> 00:02:17,760 -encoder anymore. As we have seen before with the  -decoder, it can act in an auto-regressive manner;   - -22 -00:02:18,640 --> 00:02:24,960 -the word it has just output can now be used  -as an input. 
This, in combination with the   - -23 -00:02:24,960 --> 00:02:30,800 -numerical representation output by the encoder,  -can now be used to generate a second word.   - -24 -00:02:33,200 --> 00:02:38,880 -Please note that the first word is still here; as  -the model still outputs it. However, it is greyed   - -25 -00:02:38,880 --> 00:02:45,120 -out as we have no need for it anymore. We can  -continue on and on; for example until the decoder   - -26 -00:02:45,120 --> 00:02:50,720 -outputs a value that we consider a "stopping  -value", like a dot, meaning the end of a sequence.   - -27 -00:02:53,440 --> 00:02:58,080 -Here, we've seen the full mechanism of the  -encoder-decoder transformer: let's go over it one   - -28 -00:02:58,080 --> 00:03:05,120 -more time. We have an initial sequence, that is  -sent to the encoder. That encoder output is then   - -29 -00:03:05,120 --> 00:03:12,240 -sent to the decoder, for it to be decoded. While  -we can now discard the encoder after a single use,   - -30 -00:03:12,240 --> 00:03:17,840 -the decoder will be used several times: until  -we have generated every word that we need.   - -31 -00:03:20,000 --> 00:03:25,120 -Let's see a concrete example; with Translation  -Language Modeling; also called transduction;   - -32 -00:03:25,120 --> 00:03:30,800 -the act of translating a sequence. Here, we would  -like to translate this English sequence "Welcome   - -33 -00:03:30,800 --> 00:03:38,400 -to NYC" in French. We're using a transformer model  -that is trained for that task explicitly. We use   - -34 -00:03:38,400 --> 00:03:43,520 -the encoder to create a representation  -of the English sentence. We cast this   - -35 -00:03:43,520 --> 00:03:48,880 -to the decoder and, with the use of the start of  -sequence word, we ask it to output the first word.   - -36 -00:03:50,720 --> 00:03:52,960 -It outputs Bienvenue, which means "Welcome".   - -37 -00:03:55,280 --> 00:04:02,480 -We then use "Bienvenue" as the input sequence for  -the decoder. This, alongside the feature vector,   - -38 -00:04:04,320 --> 00:04:08,480 -allows the decoder to predict the second  -word, "à", which is "to" in English.   - -39 -00:04:10,160 --> 00:04:14,400 -Finally, we ask the decoder to predict  -a third word; it predicts "NYC",   - -40 -00:04:14,400 --> 00:04:20,240 -which is, once again, correct. We've translated  -the sentence! Where the encoder-decoder really   - -41 -00:04:20,240 --> 00:04:24,880 -shines, is that we have an encoder and a  -decoder; which often do not share weights.   - -42 -00:04:27,280 --> 00:04:31,440 -We, therefore, have an entire block (the encoder)  -that can be trained to understand the sequence,   - -43 -00:04:31,440 --> 00:04:36,480 -and extract the relevant information. For the  -translation scenario we've seen earlier, for   - -44 -00:04:36,480 --> 00:04:44,160 -example, this would mean parsing and understanding  -what was said in the English language; extracting   - -45 -00:04:44,160 --> 00:04:49,040 -information from that language, and putting  -all of that in a vector dense in information.   - -46 -00:04:50,880 --> 00:04:57,280 -On the other hand, we have the decoder, whose  -sole purpose is to decode the feature output by   - -47 -00:04:57,280 --> 00:05:03,760 -the encoder. This decoder can be specialized in  -a completely different language, or even modality   - -48 -00:05:03,760 --> 00:05:11,760 -like images or speech. Encoders-decoders  -are special for several reasons. 
Firstly,   - -49 -00:05:11,760 --> 00:05:17,040 -they're able to manage sequence to sequence  -tasks, like translation that we have just seen.   - -50 -00:05:18,640 --> 00:05:23,880 -Secondly, the weights between the encoder and the  -decoder parts are not necessarily shared. Let's   - -51 -00:05:24,480 --> 00:05:31,200 -take another example of translation. Here we're  -translating "Transformers are powerful" in French.   - -52 -00:05:32,240 --> 00:05:36,560 -Firstly, this means that from a sequence  -of three words, we're able to generate   - -53 -00:05:36,560 --> 00:05:42,240 -a sequence of four words. One could argue  -that this could be handled with a decoder;   - -54 -00:05:42,240 --> 00:05:46,960 -that would generate the translation in an  -auto-regressive manner; and they would be right!   - -55 -00:05:49,840 --> 00:05:53,840 -Another example of where sequence to sequence  -transformers shine is in summarization.   - -56 -00:05:54,640 --> 00:05:58,560 -Here we have a very long  -sequence, generally a full text,   - -57 -00:05:58,560 --> 00:06:03,840 -and we want to summarize it. Since the  -encoder and decoders are separated,   - -58 -00:06:03,840 --> 00:06:08,880 -we can have different context lengths (for  -example a very long context for the encoder which   - -59 -00:06:08,880 --> 00:06:13,840 -handles the text, and a smaller context for the  -decoder which handles the summarized sequence).   - -60 -00:06:16,240 --> 00:06:20,480 -There are a lot of sequence to sequence  -models. This contains a few examples of   - -61 -00:06:20,480 --> 00:06:24,160 -popular encoder-decoder models  -available in the transformers library.   - -62 -00:06:26,320 --> 00:06:31,200 -Additionally, you can load an encoder  -and a decoder inside an encoder-decoder   - -63 -00:06:31,200 --> 00:06:35,040 -model! Therefore, according to the  -specific task you are targeting,   - -64 -00:06:35,040 --> 00:06:40,240 -you may choose to use specific encoders  -and decoders, which have proven their worth   - -65 -00:06:40,240 --> 00:06:49,850 -on these specific tasks. This wraps things up  -for the encoder-decoders. Thanks for watching! +1 +00:00:00,520 --> 00:00:02,603 +(swoosh) + +2 +00:00:04,230 --> 00:00:05,063 +- In this video, + +3 +00:00:05,063 --> 00:00:07,638 +we'll study the +encoder-decoder architecture. + +4 +00:00:07,638 --> 00:00:12,243 +An example of a popular +encoder-decoder model is T5. + +5 +00:00:13,770 --> 00:00:16,980 +In order to understand how +the encoder-decoder works, + +6 +00:00:16,980 --> 00:00:18,630 +we recommend you check out the videos + +7 +00:00:18,630 --> 00:00:22,590 +on encoders and decoders +as standalone models. + +8 +00:00:22,590 --> 00:00:24,990 +Understanding how they work individually + +9 +00:00:24,990 --> 00:00:28,323 +will help understanding how +an encoder-decoder works. + +10 +00:00:30,510 --> 00:00:33,390 +Let's start from what we've +seen about the encoder. + +11 +00:00:33,390 --> 00:00:36,240 +The encoder takes words as inputs, + +12 +00:00:36,240 --> 00:00:38,520 +casts them through the encoder, + +13 +00:00:38,520 --> 00:00:40,800 +and retrieves a numerical representation + +14 +00:00:40,800 --> 00:00:42,663 +for each word cast through it. + +15 +00:00:43,560 --> 00:00:46,470 +We now know that this +numerical representation + +16 +00:00:46,470 --> 00:00:49,473 +holds information about the +meaning of the sequence. + +17 +00:00:51,090 --> 00:00:54,243 +Let's put this aside and add +the decoder to the diagram. 
+ +18 +00:00:56,610 --> 00:00:57,510 +In this scenario, + +19 +00:00:57,510 --> 00:00:59,190 +we're using the decoder in a manner + +20 +00:00:59,190 --> 00:01:00,960 +that we haven't seen before. + +21 +00:01:00,960 --> 00:01:04,173 +We're passing the outputs of +the encoder directly to it. + +22 +00:01:05,356 --> 00:01:07,770 +Additionally to the encoder outputs, + +23 +00:01:07,770 --> 00:01:10,800 +we also give the decoder a sequence. + +24 +00:01:10,800 --> 00:01:12,840 +When prompting the decoder for an output + +25 +00:01:12,840 --> 00:01:14,190 +with no initial sequence, + +26 +00:01:14,190 --> 00:01:16,140 +we can give it the value that indicates + +27 +00:01:16,140 --> 00:01:18,060 +the start of a sequence. + +28 +00:01:18,060 --> 00:01:20,919 +And that's where the +encoder-decoder magic happens. + +29 +00:01:20,919 --> 00:01:24,082 +The encoder accepts a sequence as input. + +30 +00:01:24,082 --> 00:01:25,980 +It computes a prediction, + +31 +00:01:25,980 --> 00:01:28,858 +and outputs a numerical representation. + +32 +00:01:28,858 --> 00:01:33,120 +Then, it sends that over to the decoder. + +33 +00:01:33,120 --> 00:01:36,300 +It has, in a sense, encoded that sequence. + +34 +00:01:36,300 --> 00:01:38,130 +And the decoder, in turn, + +35 +00:01:38,130 --> 00:01:40,847 +using this input alongside +its usual sequence input, + +36 +00:01:40,847 --> 00:01:43,906 +will take a stab at decoding the sequence. + +37 +00:01:43,906 --> 00:01:46,530 +The decoder decodes the sequence, + +38 +00:01:46,530 --> 00:01:48,360 +and outputs a word. + +39 +00:01:48,360 --> 00:01:51,300 +As of now, we don't need +to make sense of that word, + +40 +00:01:51,300 --> 00:01:53,100 +but we can understand that the decoder + +41 +00:01:53,100 --> 00:01:56,103 +is essentially decoding +what the encoder has output. + +42 +00:01:57,008 --> 00:02:00,000 +The start of sequence word here + +43 +00:02:00,000 --> 00:02:02,871 +indicates that it should +start decoding the sequence. + +44 +00:02:02,871 --> 00:02:06,870 +Now that we have both the +encoder numerical representation + +45 +00:02:06,870 --> 00:02:09,570 +and an initial generated word, + +46 +00:02:09,570 --> 00:02:11,343 +we don't need the encoder anymore. + +47 +00:02:12,269 --> 00:02:15,540 +As we have seen before with the decoder, + +48 +00:02:15,540 --> 00:02:18,720 +it can act in an auto-regressive manner. + +49 +00:02:18,720 --> 00:02:22,933 +The word it has just output +can now be used as an input. + +50 +00:02:22,933 --> 00:02:26,188 +This, in combination with +the numerical representation + +51 +00:02:26,188 --> 00:02:28,560 +output by the encoder, + +52 +00:02:28,560 --> 00:02:31,203 +can now be used to generate a second word. + +53 +00:02:33,040 --> 00:02:35,910 +Please note that the +first word is still here, + +54 +00:02:35,910 --> 00:02:37,770 +as the model still outputs it. + +55 +00:02:37,770 --> 00:02:39,240 +However, we have grayed it out + +56 +00:02:39,240 --> 00:02:40,940 +as we have no need for it anymore. + +57 +00:02:41,880 --> 00:02:44,070 +We can continue on and on, for example, + +58 +00:02:44,070 --> 00:02:46,320 +until the decoder outputs a value + +59 +00:02:46,320 --> 00:02:48,540 +that we consider a stopping value, + +60 +00:02:48,540 --> 00:02:51,093 +like a dot meaning the end of a sequence. + +61 +00:02:53,580 --> 00:02:55,926 +Here, we've seen the full mechanism + +62 +00:02:55,926 --> 00:02:57,540 +of the encoder-decoder transformer. + +63 +00:02:57,540 --> 00:02:59,280 +Let's go over it one more time. 
+ +64 +00:02:59,280 --> 00:03:02,773 +We have an initial sequence +that is sent to the encoder. + +65 +00:03:02,773 --> 00:03:06,450 +That encoder output is +then sent to the decoder + +66 +00:03:06,450 --> 00:03:07,563 +for it to be decoded. + +67 +00:03:08,760 --> 00:03:12,450 +While it can now discard the +encoder after a single use, + +68 +00:03:12,450 --> 00:03:14,427 +the decoder will be used several times + +69 +00:03:14,427 --> 00:03:17,763 +until we have generated +every word that we need. + +70 +00:03:19,288 --> 00:03:21,510 +So let's see a concrete example + +71 +00:03:21,510 --> 00:03:23,460 +with Translation Language Modeling. + +72 +00:03:23,460 --> 00:03:24,930 +Also called transduction, + +73 +00:03:24,930 --> 00:03:28,200 +which is the act of +translating a sequence. + +74 +00:03:28,200 --> 00:03:30,577 +Here, we would like to +translate this English sequence + +75 +00:03:30,577 --> 00:03:33,067 +"Welcome to NYC" in French. + +76 +00:03:33,067 --> 00:03:35,460 +We're using a transformer model + +77 +00:03:35,460 --> 00:03:38,070 +that is trained for that task explicitly. + +78 +00:03:38,070 --> 00:03:40,560 +We use the encoder to +create a representation + +79 +00:03:40,560 --> 00:03:42,240 +of the English sentence. + +80 +00:03:42,240 --> 00:03:44,730 +We cast this to the decoder, + +81 +00:03:44,730 --> 00:03:46,620 +with the use of the +start of sequence word, + +82 +00:03:46,620 --> 00:03:49,173 +we ask it to output the first word. + +83 +00:03:50,029 --> 00:03:53,607 +It outputs bienvenue, which means welcome. + +84 +00:03:53,607 --> 00:03:56,640 +And we then use bienvenue + +85 +00:03:56,640 --> 00:03:59,283 +as the input sequence for the decoder. + +86 +00:04:00,188 --> 00:04:04,470 +This, alongside the encoder +numerical representation, + +87 +00:04:04,470 --> 00:04:07,440 +allows the decoder to +predict the second word, Ã, + +88 +00:04:07,440 --> 00:04:09,240 +which is to in English. + +89 +00:04:09,240 --> 00:04:13,590 +Finally, we ask the decoder +to predict a third word + +90 +00:04:13,590 --> 00:04:15,330 +It predicts NYC, which is correct. + +91 +00:04:15,330 --> 00:04:18,288 +We've translated the sentence. + +92 +00:04:18,288 --> 00:04:20,760 +Where the encoder-decoder really shines, + +93 +00:04:20,760 --> 00:04:23,550 +is that we have an encoder and a decoder, + +94 +00:04:23,550 --> 00:04:25,323 +which often do not share weights. + +95 +00:04:26,256 --> 00:04:29,460 +Therefore, we have an +entire block, the encoder, + +96 +00:04:29,460 --> 00:04:31,650 +that can be trained to +understand the sequence + +97 +00:04:31,650 --> 00:04:34,290 +and extract the relevant information. + +98 +00:04:34,290 --> 00:04:36,450 +For the translation +scenario we've seen earlier, + +99 +00:04:36,450 --> 00:04:38,760 +for example, this would mean parsing + +100 +00:04:38,760 --> 00:04:42,003 +and understanding what was +said in the English language. + +101 +00:04:42,900 --> 00:04:45,960 +It would mean extracting +information from that language, + +102 +00:04:45,960 --> 00:04:49,413 +and putting all of that in a +vector dense in information. + +103 +00:04:50,361 --> 00:04:53,370 +On the other hand, we have the decoder, + +104 +00:04:53,370 --> 00:04:56,850 +whose sole purpose is to decode +the numerical representation + +105 +00:04:56,850 --> 00:04:58,203 +output by the encoder. 
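
The translation walkthrough above maps to a few lines of code. A sketch, assuming Helsinki-NLP/opus-mt-en-fr as the English-to-French encoder-decoder checkpoint:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

checkpoint = "Helsinki-NLP/opus-mt-en-fr"  # assumption: any English-to-French seq2seq checkpoint works here
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

inputs = tokenizer("Welcome to NYC", return_tensors="pt")
# generate() runs the encoder once, then calls the decoder auto-regressively,
# starting from the start-of-sequence token and stopping at the end-of-sequence token
output_ids = model.generate(**inputs)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
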
+ +106 +00:04:59,460 --> 00:05:01,170 +This decoder can be specialized + +107 +00:05:01,170 --> 00:05:02,970 +in a completely different language, + +108 +00:05:02,970 --> 00:05:05,403 +or even modality like images or speech. + +109 +00:05:07,170 --> 00:05:10,473 +Encoders-decoders are +special for several reasons. + +110 +00:05:11,310 --> 00:05:15,570 +Firstly, they're able to manage +sequence to sequence tasks, + +111 +00:05:15,570 --> 00:05:18,358 +like translation that we have just seen. + +112 +00:05:18,358 --> 00:05:20,940 +Secondly, the weights between the encoder + +113 +00:05:20,940 --> 00:05:24,540 +and the decoder parts are +not necessarily shared. + +114 +00:05:24,540 --> 00:05:27,172 +Let's take another example of translation. + +115 +00:05:27,172 --> 00:05:30,810 +Here we're translating +"Transformers are powerful" + +116 +00:05:30,810 --> 00:05:32,048 +in French. + +117 +00:05:32,048 --> 00:05:35,258 +Firstly, this means that from +a sequence of three words, + +118 +00:05:35,258 --> 00:05:39,030 +we're able to generate a +sequence of four words. + +119 +00:05:39,030 --> 00:05:42,480 +One could argue that this +could be handled with a decoder + +120 +00:05:42,480 --> 00:05:44,160 +that would generate the translation + +121 +00:05:44,160 --> 00:05:46,260 +in an auto-regressive manner, + +122 +00:05:46,260 --> 00:05:47,460 +and they would be right. + +123 +00:05:49,980 --> 00:05:51,930 +Another example of where +sequence to sequence + +124 +00:05:51,930 --> 00:05:54,810 +transformers shine is in summarization. + +125 +00:05:54,810 --> 00:05:58,379 +Here we have a very long +sequence, generally a full text, + +126 +00:05:58,379 --> 00:06:01,020 +and we want to summarize it. + +127 +00:06:01,020 --> 00:06:04,020 +Since the encoder and +decoders are separated, + +128 +00:06:04,020 --> 00:06:06,300 +we can have different context lengths. + +129 +00:06:06,300 --> 00:06:08,910 +For example, a very long +context for the encoder, + +130 +00:06:08,910 --> 00:06:10,230 +which handles the text, + +131 +00:06:10,230 --> 00:06:12,210 +and a smaller context for the decoder + +132 +00:06:12,210 --> 00:06:14,223 +which handles the summarized sequence. + +133 +00:06:16,470 --> 00:06:18,840 +There are a lot of sequence +to sequence models. + +134 +00:06:18,840 --> 00:06:20,310 +This contains a few examples + +135 +00:06:20,310 --> 00:06:22,500 +of popular encoder-decoder models + +136 +00:06:22,500 --> 00:06:24,400 +available in the transformers library. + +137 +00:06:25,829 --> 00:06:29,940 +Additionally, you can load +an encoder and a decoder + +138 +00:06:29,940 --> 00:06:32,130 +inside an encoder-decoder model. + +139 +00:06:32,130 --> 00:06:35,190 +Therefore, according to the +specific task you are targeting, + +140 +00:06:35,190 --> 00:06:38,700 +you may choose to use specific +encoders and decoders, + +141 +00:06:38,700 --> 00:06:42,613 +which have proven their worth +on these specific tasks. + +142 +00:06:42,613 --> 00:06:44,696 +(swoosh) + diff --git a/subtitles/en/08_what-happens-inside-the-pipeline-function-(pytorch).srt b/subtitles/en/08_what-happens-inside-the-pipeline-function-(pytorch).srt index a4e7b1e00..dc405bae7 100644 --- a/subtitles/en/08_what-happens-inside-the-pipeline-function-(pytorch).srt +++ b/subtitles/en/08_what-happens-inside-the-pipeline-function-(pytorch).srt @@ -1,244 +1,471 @@ -1 -00:00:05,200 --> 00:00:09,680 -What happens inside the pipeline  -function? 
In this video,   - -2 -00:00:09,680 --> 00:00:14,240 -we will look at what actually happens when we use  -the pipeline function of the Transformers library.   - -3 -00:00:14,880 --> 00:00:19,440 -More specifically, we will look at the  -sentiment analysis pipeline, and how it   - -4 -00:00:19,440 --> 00:00:24,960 -went from the two following sentences to the  -positive labels with their respective scores.   - -5 -00:00:26,560 --> 00:00:30,720 -As we have seen in the pipeline presentation,  -there are three stages in the pipeline.   - -6 -00:00:31,520 --> 00:00:35,920 -First, we convert the raw texts to  -numbers the model can make sense of,   - -7 -00:00:35,920 --> 00:00:41,520 -using a tokenizer. Then, those numbers go  -through the model, which outputs logits.   - -8 -00:00:42,640 --> 00:00:47,040 -Finally, the post-processing steps transforms  -those logits into labels and scores.   - -9 -00:00:47,920 --> 00:00:53,440 -Let's look in detail at those three steps, and how  -to replicate them using the Transformers library,   - -10 -00:00:53,440 --> 00:01:01,040 -beginning with the first stage, tokenization. The  -tokenization process has several steps. First,   - -11 -00:01:01,040 --> 00:01:07,360 -the text is split into small chunks called tokens.  -They can be words, parts of words or punctuation   - -12 -00:01:07,360 --> 00:01:14,160 -symbols. Then the tokenizer will had some special  -tokens (if the model expect them). Here the model   - -13 -00:01:14,160 --> 00:01:19,440 -uses expects a CLS token at the beginning and a  -SEP token at the end of the sentence to classify.   - -14 -00:01:20,400 --> 00:01:25,440 -Lastly, the tokenizer matches each token to its  -unique ID in the vocabulary of the pretrained   - -15 -00:01:25,440 --> 00:01:31,360 -model. To load such a tokenizer, the Transformers  -library provides the AutoTokenizer API.   - -16 -00:01:32,400 --> 00:01:36,320 -The most important method of this  -class is from_pretrained, which will   - -17 -00:01:36,320 --> 00:01:41,680 -download and cache the configuration and the  -vocabulary associated to a given checkpoint.   - -18 -00:01:43,040 --> 00:01:48,880 -Here, the checkpoint used by default for the  -sentiment analysis pipeline is distilbert base   - -19 -00:01:48,880 --> 00:01:56,080 -uncased finetuned sst2 english. We instantiate  -a tokenizer associated with that checkpoint,   - -20 -00:01:56,640 --> 00:02:01,920 -then feed it the two sentences. Since those  -two sentences are not of the same size,   - -21 -00:02:01,920 --> 00:02:05,040 -we will need to pad the shortest  -one to be able to build an array.   - -22 -00:02:05,760 --> 00:02:08,240 -This is done by the tokenizer  -with the option padding=True.   - -23 -00:02:09,600 --> 00:02:14,800 -With truncation=True, we ensure that any sentence  -longer than the maximum the model can handle   - -24 -00:02:14,800 --> 00:02:21,840 -is truncated. Lastly, the return_tensors option  -tells the tokenizer to return a PyTorch tensor.   - -25 -00:02:23,040 --> 00:02:29,040 -Looking at the result, we see we have a dictionary  -with two keys. Input IDs contains the IDs of both   - -26 -00:02:29,040 --> 00:02:34,080 -sentences, with 0s where the padding is  -applied. The second key, attention mask,   - -27 -00:02:34,080 --> 00:02:37,840 -indicates where padding has been applied,  -so the model does not pay attention to it.   - -28 -00:02:38,640 --> 00:02:43,040 -This is all what is inside the tokenization  -step. 
Now let's have a look at the second step,   - -29 -00:02:43,760 --> 00:02:50,560 -the model. As for the tokenizer, there is an  -AutoModel API, with a from_pretrained method.   - -30 -00:02:50,560 --> 00:02:54,720 -It will download and cache the configuration  -of the model as well as the pretrained weights.   - -31 -00:02:55,840 --> 00:03:00,480 -However, the AutoModel API will only  -instantiate the body of the model,   - -32 -00:03:00,480 --> 00:03:05,120 -that is, the part of the model that is  -left once the pretraining head is removed.   - -33 -00:03:05,840 --> 00:03:11,360 -It will output a high-dimensional tensor that is a  -representation of the sentences passed, but which   - -34 -00:03:11,360 --> 00:03:17,200 -is not directly useful for our classification  -problem. Here the tensor has two sentences,   - -35 -00:03:17,200 --> 00:03:25,440 -each of sixteen tokens and the last dimension is  -the hidden size of our model 768. To get an output   - -36 -00:03:25,440 --> 00:03:30,240 -linked to our classification problem, we need to  -use the AutoModelForSequenceClassification class.   - -37 -00:03:30,960 --> 00:03:35,200 -It works exactly as the AutoModel class,  -except that it will build a model with a   - -38 -00:03:35,200 --> 00:03:40,720 -classification head. There is one auto class for  -each common NLP task in the Transformers library.   - -39 -00:03:42,000 --> 00:03:47,600 -Here, after giving our model the two  -sentences, we get a tensor of size two by two:   - -40 -00:03:47,600 --> 00:03:53,680 -one result for each sentence and for each possible  -label. Those outputs are not probabilities yet   - -41 -00:03:53,680 --> 00:03:59,120 -(we can see they don't sum to 1). This is because  -each model of the Transformers library returns   - -42 -00:03:59,120 --> 00:04:05,120 -logits. To make sense of those logits, we need to  -dig into the third and last step of the pipeline:   - -43 -00:04:05,680 --> 00:04:11,840 -post-processing. To convert logits into  -probabilities, we need to apply a SoftMax   - -44 -00:04:11,840 --> 00:04:17,760 -layer to them. As we can see, this transforms  -them into positive numbers that sum up to 1.   - -45 -00:04:18,960 --> 00:04:22,800 -The last step is to know which of those  -corresponds to the positive or the negative label.   - -46 -00:04:23,360 --> 00:04:30,160 -This is given by the id2label field of the  -model config. The first probabilities (index 0)   - -47 -00:04:30,160 --> 00:04:35,360 -correspond to the negative label, and the seconds  -(index 1) correspond to the positive label.   - -48 -00:04:36,240 --> 00:04:40,560 -This is how our classifier built with the  -pipeline function picked those labels and computed   - -49 -00:04:40,560 --> 00:04:52,080 -those scores. Now that you know how each steps  -works, you can easily tweak them to your needs. +1 +00:00:00,554 --> 00:00:03,304 +(logo whooshing) + +2 +00:00:05,340 --> 00:00:07,563 +- What happens inside +the pipeline function? + +3 +00:00:08,760 --> 00:00:11,580 +In this video, we will look +at what actually happens + +4 +00:00:11,580 --> 00:00:13,080 +when we use the pipeline function + +5 +00:00:13,080 --> 00:00:15,090 +of the Transformers library. 
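
The black box being unpacked in this video fits in two lines. A sketch with placeholder sentences (any pair of sentences works):

from transformers import pipeline

classifier = pipeline("sentiment-analysis")
# One label and score per sentence; the inputs below are placeholders
print(classifier(["I really enjoyed this course.", "I hated every minute of it."]))
# e.g. [{'label': 'POSITIVE', 'score': ...}, {'label': 'NEGATIVE', 'score': ...}]
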
+ +6 +00:00:15,090 --> 00:00:16,860 +More specifically, we will look + +7 +00:00:16,860 --> 00:00:19,200 +at the sentiment analysis pipeline, + +8 +00:00:19,200 --> 00:00:22,020 +and how it went from the +two following sentences, + +9 +00:00:22,020 --> 00:00:23,970 +to the positive and negative labels + +10 +00:00:23,970 --> 00:00:25,420 +with their respective scores. + +11 +00:00:26,760 --> 00:00:29,190 +As we have seen in the +pipeline presentation, + +12 +00:00:29,190 --> 00:00:31,860 +there are three stages in the pipeline. + +13 +00:00:31,860 --> 00:00:34,620 +First, we convert the raw texts to numbers + +14 +00:00:34,620 --> 00:00:37,173 +the model can make sense +of using a tokenizer. + +15 +00:00:38,010 --> 00:00:40,530 +Then those numbers go through the model, + +16 +00:00:40,530 --> 00:00:41,943 +which outputs logits. + +17 +00:00:42,780 --> 00:00:45,600 +Finally, the post-processing +steps transforms + +18 +00:00:45,600 --> 00:00:48,150 +those logits into labels and scores. + +19 +00:00:48,150 --> 00:00:50,700 +Let's look in detail at those three steps + +20 +00:00:50,700 --> 00:00:53,640 +and how to replicate them +using the Transformers library, + +21 +00:00:53,640 --> 00:00:56,043 +beginning with the first +stage, tokenization. + +22 +00:00:57,915 --> 00:01:00,360 +The tokenization process +has several steps. + +23 +00:01:00,360 --> 00:01:04,950 +First, the text is split into +small chunks called tokens. + +24 +00:01:04,950 --> 00:01:08,550 +They can be words, parts of +words or punctuation symbols. + +25 +00:01:08,550 --> 00:01:11,580 +Then the tokenizer will +had some special tokens, + +26 +00:01:11,580 --> 00:01:13,500 +if the model expect them. + +27 +00:01:13,500 --> 00:01:16,860 +Here the model uses expects +a CLS token at the beginning + +28 +00:01:16,860 --> 00:01:19,743 +and a SEP token at the end +of the sentence to classify. + +29 +00:01:20,580 --> 00:01:24,180 +Lastly, the tokenizer matches +each token to its unique ID + +30 +00:01:24,180 --> 00:01:27,000 +in the vocabulary of the pretrained model. + +31 +00:01:27,000 --> 00:01:28,680 +To load such a tokenizer, + +32 +00:01:28,680 --> 00:01:31,743 +the Transformers library +provides the AutoTokenizer API. + +33 +00:01:32,730 --> 00:01:36,120 +The most important method of +this class is from_pretrained, + +34 +00:01:36,120 --> 00:01:38,910 +which will download and +cache the configuration + +35 +00:01:38,910 --> 00:01:41,853 +and the vocabulary associated +to a given checkpoint. + +36 +00:01:43,200 --> 00:01:45,360 +Here the checkpoint used by default + +37 +00:01:45,360 --> 00:01:47,280 +for the sentiment analysis pipeline + +38 +00:01:47,280 --> 00:01:51,986 +is +distilbert-base-uncased-finetuned-sst-2-English. + +39 +00:01:51,986 --> 00:01:53,700 +(indistinct) + +40 +00:01:53,700 --> 00:01:56,490 +We instantiate a tokenizer +associated with that checkpoint, + +41 +00:01:56,490 --> 00:01:59,490 +then feed it the two sentences. + +42 +00:01:59,490 --> 00:02:02,100 +Since those two sentences +are not of the same size, + +43 +00:02:02,100 --> 00:02:03,930 +we will need to pad the shortest one + +44 +00:02:03,930 --> 00:02:06,030 +to be able to build an array. + +45 +00:02:06,030 --> 00:02:09,840 +This is done by the tokenizer +with the option, padding=True. + +46 +00:02:09,840 --> 00:02:12,810 +With truncation=True, we +ensure that any sentence + +47 +00:02:12,810 --> 00:02:15,873 +longer than the maximum the +model can handle is truncated. 
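A minimal sketch of this tokenization step, using the checkpoint named in the narration; the input sentences are placeholders, and the return_tensors option is covered just below.

    from transformers import AutoTokenizer

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    raw_inputs = [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
    # padding=True pads the shorter sentence, truncation=True cuts sentences that are
    # too long, and return_tensors="pt" asks for PyTorch tensors.
    inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")
    print(inputs["input_ids"])       # token IDs, with the pad ID where padding was added
    print(inputs["attention_mask"])  # 0 where padding was applied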
+ +48 +00:02:17,010 --> 00:02:19,620 +Lastly, the return_tensors option + +49 +00:02:19,620 --> 00:02:22,323 +tells the tokenizer to +return a PyTorch tensor. + +50 +00:02:23,190 --> 00:02:25,590 +Looking at the result, we +see we have a dictionary + +51 +00:02:25,590 --> 00:02:26,670 +with two keys. + +52 +00:02:26,670 --> 00:02:29,970 +Input IDs contains the +IDs of both sentences, + +53 +00:02:29,970 --> 00:02:32,550 +with zero where the padding is applied. + +54 +00:02:32,550 --> 00:02:34,260 +The second key, attention mask, + +55 +00:02:34,260 --> 00:02:36,150 +indicates where padding has been applied, + +56 +00:02:36,150 --> 00:02:38,940 +so the model does not pay attention to it. + +57 +00:02:38,940 --> 00:02:42,090 +This is all what is inside +the tokenization step. + +58 +00:02:42,090 --> 00:02:46,289 +Now, let's have a look at +the second step, the model. + +59 +00:02:46,289 --> 00:02:47,952 +As for the tokenizer, + +60 +00:02:47,952 --> 00:02:51,133 +there is an AutoModel API +with a from_pretrained method. + +61 +00:02:51,133 --> 00:02:53,954 +It will download and cache +the configuration of the model + +62 +00:02:53,954 --> 00:02:56,280 +as well as the pretrained weights. + +63 +00:02:56,280 --> 00:02:58,200 +However, the AutoModel API + +64 +00:02:58,200 --> 00:03:00,630 +will only instantiate +the body of the model, + +65 +00:03:00,630 --> 00:03:03,420 +that is the part of the model that is left + +66 +00:03:03,420 --> 00:03:06,090 +once the pretraining head is removed. + +67 +00:03:06,090 --> 00:03:08,610 +It will output a high-dimensional tensor + +68 +00:03:08,610 --> 00:03:11,220 +that is a representation +of the sentences passed, + +69 +00:03:11,220 --> 00:03:12,690 +but which is not directly useful + +70 +00:03:12,690 --> 00:03:15,030 +for our classification problem. + +71 +00:03:15,030 --> 00:03:19,230 +Here the tensor has two +sentences, each of 16 tokens, + +72 +00:03:19,230 --> 00:03:23,433 +and the last dimension is the +hidden size of our model, 768. + +73 +00:03:24,900 --> 00:03:27,510 +To get an output linked to +our classification problem, + +74 +00:03:27,510 --> 00:03:31,170 +we need to use the +AutoModelForSequenceClassification class. + +75 +00:03:31,170 --> 00:03:33,330 +It works exactly as the AutoModel class, + +76 +00:03:33,330 --> 00:03:35,130 +except that it will build a model + +77 +00:03:35,130 --> 00:03:36,543 +with a classification head. + +78 +00:03:37,483 --> 00:03:39,560 +There is one auto class +for each common NLP task + +79 +00:03:39,560 --> 00:03:40,960 +in the Transformers library. + +80 +00:03:42,150 --> 00:03:45,570 +Here after giving our +model the two sentences, + +81 +00:03:45,570 --> 00:03:47,820 +we get a tensor of size two by two, + +82 +00:03:47,820 --> 00:03:50,943 +one result for each sentence +and for each possible label. + +83 +00:03:51,840 --> 00:03:53,970 +Those outputs are not probabilities yet, + +84 +00:03:53,970 --> 00:03:56,100 +we can see they don't sum to 1. + +85 +00:03:56,100 --> 00:03:57,270 +This is because each model + +86 +00:03:57,270 --> 00:04:00,810 +of the Transformers +library returns logits. + +87 +00:04:00,810 --> 00:04:02,250 +To make sense of those logits, + +88 +00:04:02,250 --> 00:04:05,910 +we need to dig into the third +and last step of the pipeline. + +89 +00:04:05,910 --> 00:04:10,620 +Post-processing, to convert +logits into probabilities, + +90 +00:04:10,620 --> 00:04:13,470 +we need to apply a SoftMax layers to them. 
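Continuing the sketch, the model and post-processing steps described here could look roughly like this, assuming the inputs dictionary produced by the tokenizer sketch above.

    import torch
    from transformers import AutoModelForSequenceClassification

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

    # The model returns raw logits: one row per sentence, one column per label.
    outputs = model(**inputs)
    print(outputs.logits.shape)  # (2, 2)

    # Post-processing: a softmax turns the logits into probabilities that sum to 1.
    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    print(probabilities)
    print(model.config.id2label)  # maps index 0 / 1 to the negative / positive label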
+ +91 +00:04:13,470 --> 00:04:14,610 +As we can see, + +92 +00:04:14,610 --> 00:04:17,267 +this transforms them into positive number + +93 +00:04:17,267 --> 00:04:18,663 +that sum up to one. + +94 +00:04:18,663 --> 00:04:21,360 +The last step is to know +which of those corresponds + +95 +00:04:21,360 --> 00:04:23,580 +to the positive or the negative label. + +96 +00:04:23,580 --> 00:04:28,020 +This is given by the id2label +field of the model config. + +97 +00:04:28,020 --> 00:04:30,390 +The first probabilities, index zero, + +98 +00:04:30,390 --> 00:04:32,250 +correspond to the negative label, + +99 +00:04:32,250 --> 00:04:34,140 +and the seconds, index one, + +100 +00:04:34,140 --> 00:04:36,480 +correspond to the positive label. + +101 +00:04:36,480 --> 00:04:37,950 +This is how our classifier built + +102 +00:04:37,950 --> 00:04:40,230 +with the pipeline function +picked those labels + +103 +00:04:40,230 --> 00:04:42,240 +and computed those scores. + +104 +00:04:42,240 --> 00:04:44,220 +Now that you know how each steps works, + +105 +00:04:44,220 --> 00:04:46,220 +you can easily tweak them to your needs. + +106 +00:04:47,524 --> 00:04:50,274 +(logo whooshing) + diff --git a/subtitles/en/09_what-happens-inside-the-pipeline-function-(tensorflow).srt b/subtitles/en/09_what-happens-inside-the-pipeline-function-(tensorflow).srt index cbff989dc..21c8e3de5 100644 --- a/subtitles/en/09_what-happens-inside-the-pipeline-function-(tensorflow).srt +++ b/subtitles/en/09_what-happens-inside-the-pipeline-function-(tensorflow).srt @@ -1,238 +1,473 @@ -1 -00:00:05,360 --> 00:00:07,680 -What happens inside the pipeline function?   - -2 -00:00:09,840 --> 00:00:14,800 -In this video, we will look at what actually  -happens when we use the pipeline function of   - -3 -00:00:14,800 --> 00:00:20,880 -the Transformers library. More specifically, we  -will look at the sentiment analysis pipeline, and   - -4 -00:00:20,880 --> 00:00:26,720 -how it went from the two following sentences to  -the positive labels with their respective scores.   - -5 -00:00:28,560 --> 00:00:34,160 -As we have seen in the pipeline presentation,  -there are three stages in the pipeline. First,   - -6 -00:00:34,800 --> 00:00:38,880 -we convert the raw texts to numbers the  -model can make sense of, using a tokenizer.   - -7 -00:00:40,000 --> 00:00:43,520 -Then, those numbers go through  -the model, which outputs logits.   - -8 -00:00:44,400 --> 00:00:49,120 -Finally, the post-processing steps transforms  -those logits into labels and scores.   - -9 -00:00:50,720 --> 00:00:54,960 -Let's look in detail at those three steps, and how  -to replicate them using the Transformers library,   - -10 -00:00:54,960 --> 00:01:03,280 -beginning with the first stage, tokenization. The  -tokenization process has several steps. First,   - -11 -00:01:03,280 --> 00:01:09,120 -the text is split into small chunks called tokens.  -They can be words, parts of words or punctuation   - -12 -00:01:09,120 --> 00:01:17,440 -symbols. Then the tokenizer will had some special  -tokens (if the model expect them). Here the model   - -13 -00:01:17,440 --> 00:01:22,800 -uses expects a CLS token at the beginning and a  -SEP token at the end of the sentence to classify.   - -14 -00:01:23,760 --> 00:01:28,880 -Lastly, the tokenizer matches each token to its  -unique ID in the vocabulary of the pretrained   - -15 -00:01:28,880 --> 00:01:34,640 -model. To load such a tokenizer, the Transformers  -library provides the AutoTokenizer API.   
- -16 -00:01:35,680 --> 00:01:40,640 -The most important method of this class is  -from_pretrained, which will download and cache   - -17 -00:01:40,640 --> 00:01:47,200 -the configuration and the vocabulary associated  -to a given checkpoint. Here, the checkpoint used   - -18 -00:01:47,200 --> 00:01:53,840 -by default for the sentiment analysis pipeline is  -distilbert base uncased finetuned sst2 english.   - -19 -00:01:56,560 --> 00:02:01,440 -We instantiate a tokenizer associated with that  -checkpoint, then feed it the two sentences.   - -20 -00:02:02,640 --> 00:02:07,360 -Since those two sentences are not of the same  -size, we will need to pad the shortest one to   - -21 -00:02:07,360 --> 00:02:11,680 -be able to build an array. This is done by  -the tokenizer with the option padding=True.   - -22 -00:02:13,840 --> 00:02:18,960 -With truncation=True, we ensure that any sentence  -longer than the maximum the model can handle   - -23 -00:02:18,960 --> 00:02:25,600 -is truncated. Lastly, the return_tensors option  -tells the tokenizer to return a TensorFlow tensor.   - -24 -00:02:26,720 --> 00:02:29,680 -Looking at the result, we see we  -have a dictionary with two keys.   - -25 -00:02:30,240 --> 00:02:37,280 -Input IDs contains the IDs of both sentences, with  -0s where the padding is applied. The second key,   - -26 -00:02:37,280 --> 00:02:42,080 -attention mask, indicates where padding has been  -applied, so the model does not pay attention to   - -27 -00:02:42,080 --> 00:02:48,000 -it. This is all what is inside the tokenization  -step. Now let's have a look at the second step,   - -28 -00:02:48,640 --> 00:02:54,960 -the model. As for the tokenizer, there is an  -TFAutoModel API, with a from_pretrained method.   - -29 -00:02:55,600 --> 00:02:59,840 -It will download and cache the configuration  -of the model as well as the pretrained   - -30 -00:02:59,840 --> 00:03:05,600 -weights. However, the TFAutoModel API will  -only instantiate the body of the model,   - -31 -00:03:06,320 --> 00:03:10,640 -that is, the part of the model that is  -left once the pretraining head is removed.   - -32 -00:03:12,000 --> 00:03:16,960 -It will output a high-dimensional tensor that  -is a representation of the sentences passed,   - -33 -00:03:16,960 --> 00:03:20,080 -but which is not directly useful  -for our classification problem.   - -34 -00:03:21,760 --> 00:03:28,080 -Here the tensor has two sentences, each of sixteen  -tokens and the last dimension is the hidden size   - -35 -00:03:28,080 --> 00:03:34,320 -of our model 768. To get an output linked  -to our classification problem, we need to   - -36 -00:03:34,320 --> 00:03:40,000 -use the TFAutoModelForSequenceClassification  -class. It works exactly as the AutoModel class,   - -37 -00:03:40,000 --> 00:03:45,440 -except that it will build a model with a  -classification head. There is one auto class for   - -38 -00:03:45,440 --> 00:03:52,160 -each common NLP task in the Transformers library.  -Here, after giving our model the two sentences,   - -39 -00:03:52,160 --> 00:03:59,120 -we get a tensor of size two by two: one result for  -each sentence and for each possible label. Those   - -40 -00:03:59,120 --> 00:04:04,800 -outputs are not probabilities yet (we can see they  -don't sum to 1). This is because each model of the   - -41 -00:04:04,800 --> 00:04:10,960 -Transformers library returns logits. To make sense  -of those logits, we need to dig into the third and   - -42 -00:04:10,960 --> 00:04:17,520 -last step of the pipeline: post-processing. 
To  -convert logits into probabilities, we need to   - -43 -00:04:17,520 --> 00:04:22,800 -apply a SoftMax layer to them. As we can see,  -this transforms them into positive numbers that   - -44 -00:04:22,800 --> 00:04:28,160 -sum up to 1. The last step is to know which of  -those corresponds to the positive or the negative   - -45 -00:04:28,160 --> 00:04:34,720 -label. This is given by the id2label field  -of the model config. The first probabilities   - -46 -00:04:34,720 --> 00:04:40,800 -(index 0) correspond to the negative label, and  -the seconds (index 1) correspond to the positive   - -47 -00:04:40,800 --> 00:04:46,640 -label. This is how our classifier built with the  -pipeline function picked those labels and computed   - -48 -00:04:46,640 --> 00:04:55,840 -those scores. Now that you know how each steps  -works, you can easily tweak them to your needs. +1 +00:00:00,397 --> 00:00:02,980 +(subtle blast) + +2 +00:00:05,490 --> 00:00:07,953 +- What happens inside +the pipeline function? + +3 +00:00:09,930 --> 00:00:13,050 +In this video, we will look +at what actually happens + +4 +00:00:13,050 --> 00:00:14,820 +when we use the pipeline function + +5 +00:00:14,820 --> 00:00:16,920 +of the Transformers library. + +6 +00:00:16,920 --> 00:00:18,930 +More specifically, we will look at + +7 +00:00:18,930 --> 00:00:21,030 +the sentiment analysis pipeline, + +8 +00:00:21,030 --> 00:00:23,760 +and how it went from the +two following sentences + +9 +00:00:23,760 --> 00:00:25,800 +to the positive and negative labels + +10 +00:00:25,800 --> 00:00:27,250 +with their respective scores. + +11 +00:00:28,740 --> 00:00:31,110 +As we have seen in the pipeline video, + +12 +00:00:31,110 --> 00:00:33,900 +there are three stages in the pipeline. + +13 +00:00:33,900 --> 00:00:36,810 +First, we convert the raw texts to numbers + +14 +00:00:36,810 --> 00:00:39,160 +the model can make sense +of, using a tokenizer. + +15 +00:00:40,140 --> 00:00:42,600 +Then, those numbers go through the model, + +16 +00:00:42,600 --> 00:00:44,550 +which outputs logits. + +17 +00:00:44,550 --> 00:00:47,190 +Finally, the post-processing steps + +18 +00:00:47,190 --> 00:00:49,490 +transforms those logits +into labels and score. + +19 +00:00:51,000 --> 00:00:52,590 +Let's look in detail at those three steps, + +20 +00:00:52,590 --> 00:00:55,200 +and how to replicate them +using the Transformers library, + +21 +00:00:55,200 --> 00:00:57,903 +beginning with the first +stage, tokenization. + +22 +00:00:59,905 --> 00:01:02,520 +The tokenization process +has several steps. + +23 +00:01:02,520 --> 00:01:06,900 +First, the text is split into +small chunks called token. + +24 +00:01:06,900 --> 00:01:09,933 +They can be words, parts of +words or punctuation symbols. + +25 +00:01:10,800 --> 00:01:14,310 +Then the tokenizer will +had some special tokens + +26 +00:01:14,310 --> 00:01:15,573 +if the model expect them. + +27 +00:01:16,440 --> 00:01:20,430 +Here, the model used expects +a CLS token at the beginning + +28 +00:01:20,430 --> 00:01:23,910 +and a SEP token at the end +of the sentence to classify. + +29 +00:01:23,910 --> 00:01:27,630 +Lastly, the tokenizer matches +each token to its unique ID + +30 +00:01:27,630 --> 00:01:29,730 +in the vocabulary of the pretrained model. + +31 +00:01:30,660 --> 00:01:32,040 +To load such a tokenizer, + +32 +00:01:32,040 --> 00:01:34,983 +the Transformers library +provides the AutoTokenizer API. 
+ +33 +00:01:35,880 --> 00:01:39,510 +The most important method of +this class is from_pretrained, + +34 +00:01:39,510 --> 00:01:41,940 +which will download and +cache the configuration + +35 +00:01:41,940 --> 00:01:44,913 +and the vocabulary associated +to a given checkpoint. + +36 +00:01:46,410 --> 00:01:48,180 +Here, the checkpoint used by default + +37 +00:01:48,180 --> 00:01:50,310 +for the sentiment analysis pipeline + +38 +00:01:50,310 --> 00:01:54,510 +is distilbert base uncased +finetuned sst2 English, + +39 +00:01:54,510 --> 00:01:55,960 +which is a bit of a mouthful. + +40 +00:01:56,820 --> 00:01:59,760 +We instantiate a tokenizer +associated with that checkpoint, + +41 +00:01:59,760 --> 00:02:01,833 +then feed it the two sentences. + +42 +00:02:02,790 --> 00:02:05,490 +Since those two sentences +are not of the same size, + +43 +00:02:05,490 --> 00:02:07,440 +we will need to pad the shortest one + +44 +00:02:07,440 --> 00:02:09,570 +to be able to build an array. + +45 +00:02:09,570 --> 00:02:10,403 +This is done by the tokenizer + +46 +00:02:10,403 --> 00:02:12,603 +with the option padding=True. + +47 +00:02:14,130 --> 00:02:17,340 +With truncation=True, we +ensure that any sentence longer + +48 +00:02:17,340 --> 00:02:19,953 +than the maximum the model +can handle is truncated. + +49 +00:02:20,820 --> 00:02:24,200 +Lastly, the return_tensors +option tells the tokenizer + +50 +00:02:24,200 --> 00:02:25,773 +to return a PyTorch tensor. + +51 +00:02:26,910 --> 00:02:28,050 +Looking at the result, + +52 +00:02:28,050 --> 00:02:30,450 +we see we have a dictionary with two keys. + +53 +00:02:30,450 --> 00:02:33,840 +Input IDs contains the +IDs of both sentences, + +54 +00:02:33,840 --> 00:02:35,840 +with zeros where the padding is applied. + +55 +00:02:36,750 --> 00:02:38,550 +The second key, attention mask, + +56 +00:02:38,550 --> 00:02:40,650 +indicates where padding has been applied, + +57 +00:02:40,650 --> 00:02:42,750 +so the model does not pay attention to it. + +58 +00:02:43,590 --> 00:02:46,380 +This is all what is inside +the tokenization step. + +59 +00:02:46,380 --> 00:02:49,653 +Now let's have a look at +the second step, the model. + +60 +00:02:51,090 --> 00:02:53,850 +As for the tokenizer, +there is an AutoModel API, + +61 +00:02:53,850 --> 00:02:55,890 +with a from_pretrained method. + +62 +00:02:55,890 --> 00:02:59,100 +It will download and cache +the configuration of the model + +63 +00:02:59,100 --> 00:03:01,560 +as well as the pretrained weights. + +64 +00:03:01,560 --> 00:03:04,830 +However, the AutoModel +API will only instantiate + +65 +00:03:04,830 --> 00:03:06,540 +the body of the model, + +66 +00:03:06,540 --> 00:03:09,120 +that is, the part of +the model that is left + +67 +00:03:09,120 --> 00:03:11,103 +once the pretraining head is removed. + +68 +00:03:12,210 --> 00:03:14,460 +It will output a high-dimensional tensor + +69 +00:03:14,460 --> 00:03:17,190 +that is a representation +of the sentences passed, + +70 +00:03:17,190 --> 00:03:18,930 +but which is not directly useful + +71 +00:03:18,930 --> 00:03:20,480 +for our classification problem. + +72 +00:03:21,930 --> 00:03:24,210 +Here the tensor has two sentences, + +73 +00:03:24,210 --> 00:03:26,070 +each of sixteen token, + +74 +00:03:26,070 --> 00:03:30,393 +and the last dimension is the +hidden size of our model, 768. 
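For this TensorFlow version of the walkthrough, the same steps can be sketched with the TF-prefixed auto classes and return_tensors="tf"; the sentences are placeholders and the checkpoint is the one named in the narration.

    import tensorflow as tf
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

    checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

    inputs = tokenizer(
        ["placeholder sentence one", "placeholder sentence two"],
        padding=True, truncation=True, return_tensors="tf",
    )
    outputs = model(inputs)
    # Convert logits to probabilities and read the label mapping from the config.
    probabilities = tf.math.softmax(outputs.logits, axis=-1)
    print(probabilities)
    print(model.config.id2label)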
+ +75 +00:03:31,620 --> 00:03:34,020 +To get an output linked to +our classification problem, + +76 +00:03:34,020 --> 00:03:37,800 +we need to use the +AutoModelForSequenceClassification class. + +77 +00:03:37,800 --> 00:03:40,170 +It works exactly as the AutoModel class, + +78 +00:03:40,170 --> 00:03:41,970 +except that it will build a model + +79 +00:03:41,970 --> 00:03:43,353 +with a classification head. + +80 +00:03:44,520 --> 00:03:46,770 +There is one auto class +for each common NLP task + +81 +00:03:46,770 --> 00:03:48,170 +in the Transformers library. + +82 +00:03:49,050 --> 00:03:52,380 +Here, after giving our +model the two sentences, + +83 +00:03:52,380 --> 00:03:54,600 +we get a tensor of size two by two; + +84 +00:03:54,600 --> 00:03:57,783 +one result for each sentence +and for each possible label. + +85 +00:03:59,100 --> 00:04:01,470 +Those outputs are not probabilities yet. + +86 +00:04:01,470 --> 00:04:03,660 +We can see they don't sum to 1. + +87 +00:04:03,660 --> 00:04:06,090 +This is because each model +of the Transformers library + +88 +00:04:06,090 --> 00:04:07,830 +returns logits. + +89 +00:04:07,830 --> 00:04:09,480 +To make sense of those logits, + +90 +00:04:09,480 --> 00:04:10,980 +we need to dig into the third + +91 +00:04:10,980 --> 00:04:13,653 +and last step of the +pipeline, post-processing. + +92 +00:04:15,300 --> 00:04:17,310 +To convert logits into probabilities, + +93 +00:04:17,310 --> 00:04:19,950 +we need to apply a SoftMax layer to them. + +94 +00:04:19,950 --> 00:04:22,800 +As we can see, this transforms +them into positive numbers + +95 +00:04:22,800 --> 00:04:23,793 +that sum up to 1. + +96 +00:04:24,990 --> 00:04:27,030 +The last step is to know +which of those corresponds + +97 +00:04:27,030 --> 00:04:29,400 +to the positive or the negative label. + +98 +00:04:29,400 --> 00:04:33,480 +This is given by the id2label +field of the model config. + +99 +00:04:33,480 --> 00:04:36,000 +The first probabilities, index 0, + +100 +00:04:36,000 --> 00:04:37,740 +correspond to the negative label, + +101 +00:04:37,740 --> 00:04:42,060 +and the seconds, index 1, +correspond to the positive label. + +102 +00:04:42,060 --> 00:04:43,830 +This is how our classifier built + +103 +00:04:43,830 --> 00:04:46,260 +with the pipeline function +picked those labels + +104 +00:04:46,260 --> 00:04:47,560 +and computed those scores. + +105 +00:04:48,420 --> 00:04:50,400 +Now that you know how each steps works, + +106 +00:04:50,400 --> 00:04:52,533 +you can easily tweak them to your needs. + +107 +00:04:55,314 --> 00:04:57,897 +(subtle blast) + diff --git a/subtitles/en/10_instantiate-a-transformers-model-(pytorch).srt b/subtitles/en/10_instantiate-a-transformers-model-(pytorch).srt index f7b29b5ad..d29c6933f 100644 --- a/subtitles/en/10_instantiate-a-transformers-model-(pytorch).srt +++ b/subtitles/en/10_instantiate-a-transformers-model-(pytorch).srt @@ -1,157 +1,308 @@ -1 -00:00:05,120 --> 00:00:07,440 -How to instantiate a Transformers model?   - -2 -00:00:08,640 --> 00:00:12,960 -In this video we will look at how we can create  -and use a model from the Transformers library.   - -3 -00:00:14,160 --> 00:00:19,440 -As we've seen before, the AutoModel class allows  -you to instantiate a pretrained model from any   - -4 -00:00:19,440 --> 00:00:24,960 -checkpoint on the Hugging Face Hub. It will  -pick the right model class from the library to   - -5 -00:00:24,960 --> 00:00:30,800 -instantiate the proper architecture and load the  -weights of the pretrained model inside it. 
As we   - -6 -00:00:30,800 --> 00:00:37,760 -can see, when given a BERT checkpoint, we end up  -with a BertModel, and similarly for GPT-2 or BART.   - -7 -00:00:39,680 --> 00:00:43,440 -Behind the scenes, this API can take  -the name of a checkpoint on the Hub,   - -8 -00:00:44,080 --> 00:00:48,400 -in which case it will download and cache the  -configuration file as well as the model weights   - -9 -00:00:48,400 --> 00:00:54,800 -file. You can also specify the path to a local  -folder that contains a valid configuration file   - -10 -00:00:54,800 --> 00:01:00,720 -and a model weights file. To instantiate the  -pretrained model, the AutoModel API will first   - -11 -00:01:00,720 --> 00:01:04,960 -open the configuration file to look at the  -configuration class that should be used.   - -12 -00:01:06,080 --> 00:01:12,240 -The configuration class depends on the type of  -the model (BERT, GPT-2 or BART for instance).   - -13 -00:01:13,440 --> 00:01:18,160 -Once it has the proper configuration class,  -it can instantiate that configuration,   - -14 -00:01:18,160 --> 00:01:23,920 -which is a blueprint to know how to create the  -model. It also uses this configuration class   - -15 -00:01:23,920 --> 00:01:29,360 -to find the proper model class, which is combined  -with the loaded configuration, to load the model.   - -16 -00:01:30,800 --> 00:01:35,520 -This model is not yet our pretrained model as it  -has just been initialized with random weights.   - -17 -00:01:36,560 --> 00:01:42,960 -The last step is to load the weights from the  -model file inside this model. To easily load   - -18 -00:01:42,960 --> 00:01:47,360 -the configuration of a model from any checkpoint  -or a folder containing the configuration folder,   - -19 -00:01:48,000 --> 00:01:49,920 -we can use the AutoConfig class.   - -20 -00:01:51,040 --> 00:01:55,360 -Like the AutoModel class, it will pick the  -right configuration class from the library.   - -21 -00:01:56,800 --> 00:02:01,360 -We can also use the specific class corresponding  -to a checkpoint, but we will need to change the   - -22 -00:02:01,360 --> 00:02:08,320 -code each time we want to try a different model.  -As we said before, the configuration of a model is   - -23 -00:02:08,320 --> 00:02:12,720 -a blueprint that contains all the information  -necessary to create the model architecture.   - -24 -00:02:13,600 --> 00:02:19,680 -For instance the BERT model associated with  -the bert-base-cased checkpoint has 12 layers,   - -25 -00:02:19,680 --> 00:02:29,120 -a hidden size of 768, and a vocabulary size  -of 28,996. Once we have the configuration,   - -26 -00:02:29,680 --> 00:02:33,120 -we can create a model that has the same  -architecture as our checkpoint but is   - -27 -00:02:33,120 --> 00:02:37,840 -randomly initialized. We can then train it from  -scratch like any PyTorch module/TensorFlow model.   - -28 -00:02:38,800 --> 00:02:42,960 -We can also change any part of the  -configuration by using keyword arguments.   - -29 -00:02:43,920 --> 00:02:49,280 -The second snippet of code instantiates a  -randomly initialized BERT model with ten layers   - -30 -00:02:49,280 --> 00:02:56,160 -instead of 12. Saving a model once it's trained  -or fine-tuned is very easy: we just have to use   - -31 -00:02:56,160 --> 00:03:02,880 -the save_pretrained method. Here the model will  -be saved in a folder named my-bert-model inside   - -32 -00:03:02,880 --> 00:03:08,240 -the current working directory. Such a model can  -then be reloaded using the from_pretrained method. 
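A minimal sketch of the workflow just described: loading a pretrained model, building a randomly initialized one from a configuration (including the 10-layer variant mentioned in the narration), then saving and reloading.

    from transformers import AutoConfig, AutoModel, BertConfig, BertModel

    # Load the pretrained model; the right architecture is picked from the checkpoint config.
    model = AutoModel.from_pretrained("bert-base-cased")

    # The configuration alone acts as a blueprint of the architecture.
    config = AutoConfig.from_pretrained("bert-base-cased")
    print(config.num_hidden_layers, config.hidden_size, config.vocab_size)  # 12, 768, 28996

    # A randomly initialized BERT with 10 layers instead of 12.
    custom_config = BertConfig.from_pretrained("bert-base-cased", num_hidden_layers=10)
    custom_model = BertModel(custom_config)

    # Save, then reload from the local folder.
    model.save_pretrained("my-bert-model")
    reloaded = AutoModel.from_pretrained("my-bert-model")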
+1 +00:00:00,519 --> 00:00:03,186 +(logo swooshes) + +2 +00:00:05,310 --> 00:00:08,483 +- How to instantiate a Transformers model. + +3 +00:00:08,483 --> 00:00:11,790 +In this video, we'll look at +how we can create a user model + +4 +00:00:11,790 --> 00:00:13,290 +from the Transformers library. + +5 +00:00:14,310 --> 00:00:17,100 +As we have seen before +the AutoModel class allows + +6 +00:00:17,100 --> 00:00:19,140 +you to instantiate a pretrained model + +7 +00:00:19,140 --> 00:00:21,513 +from any checkpoint on +the Hugging Face Hub. + +8 +00:00:22,350 --> 00:00:23,910 +It'll pick the right model class + +9 +00:00:23,910 --> 00:00:26,654 +from the library to instantiate +the proper architecture + +10 +00:00:26,654 --> 00:00:29,793 +and loads of weights as the +pretrained model inside. + +11 +00:00:30,690 --> 00:00:33,810 +As we can see, when +given a BERT checkpoint + +12 +00:00:33,810 --> 00:00:38,043 +we end up with a BertModel and +similarly, for GPT-2 or BART. + +13 +00:00:40,020 --> 00:00:42,360 +Behind the scenes,this +API can take the name + +14 +00:00:42,360 --> 00:00:44,250 +of a checkpoint on the Hub + +15 +00:00:44,250 --> 00:00:46,980 +in which case it will download +and cache the configuration + +16 +00:00:46,980 --> 00:00:48,843 +file as well as a model weights file. + +17 +00:00:49,698 --> 00:00:52,710 +You can also specify the +path to a local folder + +18 +00:00:52,710 --> 00:00:55,290 +that contains a valid +configuration file and a + +19 +00:00:55,290 --> 00:00:56,390 +model of weights file. + +20 +00:00:57,600 --> 00:00:59,479 +To instantiate the pretrained model, + +21 +00:00:59,479 --> 00:01:01,950 +the AutoModel API will +first open the configuration + +22 +00:01:01,950 --> 00:01:05,403 +file to look at a configuration +class that should be used. + +23 +00:01:06,420 --> 00:01:08,580 +The configuration class +depends on the type + +24 +00:01:08,580 --> 00:01:12,663 +of the model BERT, GPT-2 +or BART for instance. + +25 +00:01:13,680 --> 00:01:15,930 +Once it has a proper configuration class, + +26 +00:01:15,930 --> 00:01:18,390 +it can instantiate that configuration + +27 +00:01:18,390 --> 00:01:21,900 +which is a blueprint to know +how to create the model. + +28 +00:01:21,900 --> 00:01:24,240 +It also uses this configuration class to + +29 +00:01:24,240 --> 00:01:27,150 +find the proper model class, +which is then combined + +30 +00:01:27,150 --> 00:01:29,823 +with the loaded configuration +to load the model. + +31 +00:01:30,904 --> 00:01:33,210 +This model is not yet a pretrained model + +32 +00:01:33,210 --> 00:01:35,883 +as it has just been initialized +with random weights. + +33 +00:01:36,840 --> 00:01:39,810 +The last step is to load the +weight from the model file + +34 +00:01:39,810 --> 00:01:40,923 +inside this model. + +35 +00:01:42,330 --> 00:01:44,250 +To easily load the +configuration of a model + +36 +00:01:44,250 --> 00:01:46,410 +from any checkpoint or folder containing + +37 +00:01:46,410 --> 00:01:48,210 +the configuration file. + +38 +00:01:48,210 --> 00:01:50,373 +We can use the AutoConfig class. + +39 +00:01:51,240 --> 00:01:52,693 +Like the AutoModel class, + +40 +00:01:52,693 --> 00:01:55,693 +it will pick the right configuration +class from the library. 
+ +41 +00:01:57,060 --> 00:01:59,220 +We can also use a specific +class corresponding + +42 +00:01:59,220 --> 00:02:01,470 +to a checkpoint, but +we will need to change + +43 +00:02:01,470 --> 00:02:03,000 +the code each time we want to try + +44 +00:02:03,000 --> 00:02:04,550 +a different model architecture. + +45 +00:02:06,030 --> 00:02:07,860 +As we said before, the configuration + +46 +00:02:07,860 --> 00:02:10,350 +of a model is a blueprint +that contains all the + +47 +00:02:10,350 --> 00:02:13,830 +information necessary to +create the model architecture. + +48 +00:02:13,830 --> 00:02:15,990 +For instance, the BERT model associated + +49 +00:02:15,990 --> 00:02:19,980 +with the bert-base-cased +checkpoint has 12 layers, + +50 +00:02:19,980 --> 00:02:24,980 +a hidden side of 768 and a +vocabulary side of 28,996. + +51 +00:02:28,020 --> 00:02:29,910 +Once we have the configuration, + +52 +00:02:29,910 --> 00:02:31,950 +we can create a model that +does the same architecture + +53 +00:02:31,950 --> 00:02:35,280 +as our checkpoint, but +is randomly initialized. + +54 +00:02:35,280 --> 00:02:36,660 +We can then train it from scratch. + +55 +00:02:36,660 --> 00:02:38,010 +Like any bio PyTorch module + +56 +00:02:39,497 --> 00:02:40,380 +We can also change any part + +57 +00:02:40,380 --> 00:02:43,200 +of the configuration by +using keyword arguments. + +58 +00:02:43,200 --> 00:02:46,138 +The second snippet of code instantiates + +59 +00:02:46,138 --> 00:02:48,360 +a randomly initialized BERT model + +60 +00:02:48,360 --> 00:02:50,403 +with 10 layers instead of 12. + +61 +00:02:51,409 --> 00:02:55,051 +Saving a model once it's trained +or fine-tuned is very easy. + +62 +00:02:55,051 --> 00:02:57,603 +We just have to use a +safe pretrained method. + +63 +00:02:58,500 --> 00:03:01,417 +Here the model will be +saved in a folder named + +64 +00:03:01,417 --> 00:03:04,473 +"my-bert-model" inside the +current working directory. + +65 +00:03:05,400 --> 00:03:08,255 +Such a model can then be +reloaded using the form + +66 +00:03:08,255 --> 00:03:09,596 +pretrained method. + +67 +00:03:09,596 --> 00:03:11,250 +To learn how to easily approach this model + +68 +00:03:11,250 --> 00:03:13,473 +to that, check out the push to a video. + diff --git a/subtitles/en/11_instantiate-a-transformers-model-(tensorflow).srt b/subtitles/en/11_instantiate-a-transformers-model-(tensorflow).srt index dda26cb36..17a04807a 100644 --- a/subtitles/en/11_instantiate-a-transformers-model-(tensorflow).srt +++ b/subtitles/en/11_instantiate-a-transformers-model-(tensorflow).srt @@ -1,195 +1,317 @@ -1 -00:00:05,540 --> 00:00:07,870 -How to instantiate a Transformers model? - -2 -00:00:07,870 --> 00:00:14,800 -In this video we will look at how we can create -and use a model from the Transformers library. - -3 -00:00:14,800 --> 00:00:20,130 -As we've seen before, the TFAutoModel class -allows you to instantiate a pretrained model - -4 -00:00:20,130 --> 00:00:23,490 -from any checkpoint on the Hugging Face Hub. - -5 -00:00:23,490 --> 00:00:27,740 -It will pick the right model class from the -library to instantiate the proper architecture - -6 -00:00:27,740 --> 00:00:31,310 -and load the weights of the pretrained model -inside it. - -7 -00:00:31,310 --> 00:00:36,630 -As we can see, when given a BERT checkpoint, -we end up with a TFBertModel, and similarly - -8 -00:00:36,630 --> 00:00:39,890 -for GPT-2 or BART. 
- -9 -00:00:39,890 --> 00:00:44,489 -Behind the scenes, this API can take the name -of a checkpoint on the Hub, in which case - -10 -00:00:44,489 --> 00:00:49,649 -it will download and cache the configuration -file as well as the model weights file. - -11 -00:00:49,649 --> 00:00:54,059 -You can also specify the path to a local folder -that contains a valid configuration file and - -12 -00:00:54,059 --> 00:00:56,739 -a model weights file. - -13 -00:00:56,739 --> 00:01:02,480 -To instantiate the pretrained model, the AutoModel -API will first open the configuration file - -14 -00:01:02,480 --> 00:01:06,409 -to look at the configuration class that should -be used. - -15 -00:01:06,409 --> 00:01:13,509 -The configuration class depends on the type -of the model (BERT, GPT-2 or BART for instance). - -16 -00:01:13,509 --> 00:01:18,130 -Once it has the proper configuration class, -it can instantiate that configuration, which - -17 -00:01:18,130 --> 00:01:20,420 -is a blueprint to know how to create the model. - -18 -00:01:20,420 --> 00:01:25,420 -It also uses this configuration class to find -the proper model class, which is combined - -19 -00:01:25,420 --> 00:01:28,470 -with the loaded configuration, to load the -model. - -20 -00:01:28,470 --> 00:01:33,759 -This model is not yet our pretrained model -as it has just been initialized with random - -21 -00:01:33,759 --> 00:01:34,759 -weights. - -22 -00:01:34,759 --> 00:01:40,299 -The last step is to load the weights from -the model file inside this model. - -23 -00:01:40,299 --> 00:01:44,659 -To easily load the configuration of a model -from any checkpoint or a folder containing - -24 -00:01:44,659 --> 00:01:48,100 -the configuration folder, we can use the AutoConfig -class. - -25 -00:01:48,100 --> 00:01:54,270 -Like the TFAutoModel class, it will pick the -right configuration class from the library. - -26 -00:01:54,270 --> 00:01:58,869 -We can also use the specific class corresponding -to a checkpoint, but we will need to change - -27 -00:01:58,869 --> 00:02:03,280 -the code each time we want to try a different -model. - -28 -00:02:03,280 --> 00:02:07,490 -As we said before, the configuration of a -model is a blueprint that contains all the - -29 -00:02:07,490 --> 00:02:11,190 -information necessary to create the model -architecture. - -30 -00:02:11,190 --> 00:02:14,629 -For instance the BERT model associated with -the bert-base-cased checkpoint has 12 layers, - -31 -00:02:14,629 --> 00:02:21,790 -a hidden size of 768, and a vocabulary size -of 28,996. - -32 -00:02:21,790 --> 00:02:28,959 -Once we have the configuration, we can create -a model that has the same architecture as - -33 -00:02:28,959 --> 00:02:31,420 -our checkpoint but is randomly initialized. - -34 -00:02:31,420 --> 00:02:36,080 -We can then train it from scratch like any -PyTorch module/TensorFlow model. - -35 -00:02:36,080 --> 00:02:40,870 -We can also change any part of the configuration -by using keyword arguments. - -36 -00:02:40,870 --> 00:02:45,860 -The second snippet of code instantiates a -randomly initialized BERT model with ten layers - -37 -00:02:45,860 --> 00:02:48,379 -instead of 12. - -38 -00:02:48,379 --> 00:02:53,019 -Saving a model once it's trained or fine-tuned -is very easy: we just have to use the save_pretrained - -39 -00:02:53,019 --> 00:02:54,019 -method. - -40 -00:02:54,019 --> 00:03:00,510 -Here the model will be saved in a folder named -my-bert-model inside the current working directory. 
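The TensorFlow equivalent of saving and reloading can be sketched as follows, reusing the folder name from the narration.

    from transformers import TFAutoModel

    tf_model = TFAutoModel.from_pretrained("bert-base-cased")
    tf_model.save_pretrained("my-bert-model")             # writes the config and weights files
    reloaded = TFAutoModel.from_pretrained("my-bert-model")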
- -41 -00:03:00,510 --> 00:03:13,120 -Such a model can then be reloaded using the -from_pretrained method. +1 +00:00:00,125 --> 00:00:02,958 +(whooshing sound) + +2 +00:00:05,463 --> 00:00:08,820 +- How to instantiate +the Transformers model? + +3 +00:00:08,820 --> 00:00:11,250 +In this video, we will +look at how we can create + +4 +00:00:11,250 --> 00:00:13,550 +and use a model from the +Transformers library. + +5 +00:00:15,000 --> 00:00:17,850 +As we've seen before, +the TFAutoModel class + +6 +00:00:17,850 --> 00:00:20,100 +allows you to instantiate +a pre-trained model + +7 +00:00:20,100 --> 00:00:22,503 +from any checkpoint on +the Hugging Face Hub. + +8 +00:00:23,430 --> 00:00:25,620 +It will pick the right +model class from the library + +9 +00:00:25,620 --> 00:00:27,750 +to instantiate the proper architecture + +10 +00:00:27,750 --> 00:00:31,200 +and load the weights of the +pre-trained model inside. + +11 +00:00:31,200 --> 00:00:34,020 +As we can see, when +given a BERT checkpoint, + +12 +00:00:34,020 --> 00:00:36,090 +we end up with a TFBertModel, + +13 +00:00:36,090 --> 00:00:38,553 +and similarly for GPT2 or BART. + +14 +00:00:40,170 --> 00:00:42,510 +Behind the scenes, this +API can take the name + +15 +00:00:42,510 --> 00:00:44,040 +of a checkpoint on the Hub, + +16 +00:00:44,040 --> 00:00:45,810 +in which case it will download and cache + +17 +00:00:45,810 --> 00:00:48,660 +the configuration file as well +as the model weights file. + +18 +00:00:49,590 --> 00:00:52,020 +You can also specify the +path to a local folder + +19 +00:00:52,020 --> 00:00:54,090 +that contains a valid configuration file + +20 +00:00:54,090 --> 00:00:55,340 +and a model weights file. + +21 +00:00:56,670 --> 00:00:58,167 +To instantiate the pre-trained model, + +22 +00:00:58,167 --> 00:01:02,400 +the TFAutoModel API will first +open the configuration file + +23 +00:01:02,400 --> 00:01:05,253 +to look at the configuration +class that should be used. + +24 +00:01:06,390 --> 00:01:09,660 +The configuration class depends +on the type of the model, + +25 +00:01:09,660 --> 00:01:12,333 +BERT, GPT2 or BART for instance. + +26 +00:01:13,320 --> 00:01:15,720 +Once it has the proper +configuration class, + +27 +00:01:15,720 --> 00:01:18,000 +it can instantiate that configuration, + +28 +00:01:18,000 --> 00:01:21,090 +which is a blueprint to know +how to create the model. + +29 +00:01:21,090 --> 00:01:22,770 +It also uses this configuration class + +30 +00:01:22,770 --> 00:01:24,750 +to find the proper model class, + +31 +00:01:24,750 --> 00:01:27,120 +which is combined with +the loaded configuration + +32 +00:01:27,120 --> 00:01:28,143 +to load the model. + +33 +00:01:29,250 --> 00:01:31,800 +This model is not yet +our pre-trained model + +34 +00:01:31,800 --> 00:01:34,560 +as it has just been initialized +with random weights. + +35 +00:01:34,560 --> 00:01:36,690 +The last step is to load the weights + +36 +00:01:36,690 --> 00:01:38,973 +from the model file inside this model. + +37 +00:01:40,230 --> 00:01:42,270 +To easily load the +configuration of a model + +38 +00:01:42,270 --> 00:01:44,220 +from any checkpoint or a folder + +39 +00:01:44,220 --> 00:01:46,170 +containing the configuration file, + +40 +00:01:46,170 --> 00:01:47,790 +we can use the AutoConfig class. + +41 +00:01:47,790 --> 00:01:50,460 +Like the TFAutoModel class, + +42 +00:01:50,460 --> 00:01:54,210 +it will pick the right configuration +class from the library. 
+ +43 +00:01:54,210 --> 00:01:56,040 +We can also use the specific class + +44 +00:01:56,040 --> 00:01:57,840 +corresponding to a checkpoint, + +45 +00:01:57,840 --> 00:01:59,430 +but we will need to change the code + +46 +00:01:59,430 --> 00:02:02,230 +each time we want to try a +different model architecture. + +47 +00:02:03,180 --> 00:02:05,353 +As we said before, the +configuration of a model + +48 +00:02:05,353 --> 00:02:08,610 +is a blueprint that contains +all the information necessary + +49 +00:02:08,610 --> 00:02:11,070 +to create the model architecture. + +50 +00:02:11,070 --> 00:02:12,750 +For instance, the BERT model + +51 +00:02:12,750 --> 00:02:15,510 +associated with the +bert-base-cased checkpoint + +52 +00:02:15,510 --> 00:02:19,710 +has 12 layers, a hidden size of 768, + +53 +00:02:19,710 --> 00:02:23,403 +and a vocabulary size of 28,996. + +54 +00:02:24,810 --> 00:02:26,670 +Once we have the configuration, + +55 +00:02:26,670 --> 00:02:28,890 +we can create a model that +has the same architecture + +56 +00:02:28,890 --> 00:02:32,160 +as our checkpoint but +is randomly initialized. + +57 +00:02:32,160 --> 00:02:36,030 +We can then train it from scratch +like any TensorFlow model. + +58 +00:02:36,030 --> 00:02:38,063 +We can also change any +part of the configuration + +59 +00:02:38,063 --> 00:02:40,770 +by using keyword arguments. + +60 +00:02:40,770 --> 00:02:43,110 +The second snippet of code instantiates + +61 +00:02:43,110 --> 00:02:44,970 +a randomly initialized BERT model + +62 +00:02:44,970 --> 00:02:46,983 +with 10 layers instead of 12. + +63 +00:02:48,240 --> 00:02:51,360 +Saving a model once it's trained +or fine-tuned is very easy. + +64 +00:02:51,360 --> 00:02:53,880 +We just have to use the +save_pretrained method. + +65 +00:02:53,880 --> 00:02:55,980 +Here, the model will be saved in a folder + +66 +00:02:55,980 --> 00:02:59,463 +named my-bert-model inside +the current working directory. + +67 +00:03:00,480 --> 00:03:02,250 +Such a model can then be reloaded + +68 +00:03:02,250 --> 00:03:04,500 +using the from_pretrained method. + +69 +00:03:04,500 --> 00:03:06,600 +To run it to a projects model to the Hub, + +70 +00:03:06,600 --> 00:03:08,350 +check out the push (mumbles) video. + +71 +00:03:09,355 --> 00:03:12,188 +(whooshing sound) + diff --git a/subtitles/en/12_tokenizers-overview.srt b/subtitles/en/12_tokenizers-overview.srt index e60183723..cc5880413 100644 --- a/subtitles/en/12_tokenizers-overview.srt +++ b/subtitles/en/12_tokenizers-overview.srt @@ -1,38 +1,99 @@ -1 -00:00:03,840 --> 00:00:09,200 -In these few videos, we'll take a look at the  -tokenizers. In Natural Language Processing,   - -2 -00:00:09,200 --> 00:00:14,880 -most of the data that we handle consists of raw  -text. However, machine learning models cannot read   - -3 -00:00:14,880 --> 00:00:23,200 -and understand text in its raw form they can only  -work with numbers. The tokenizer's objective will   - -4 -00:00:23,200 --> 00:00:30,080 -be to translate the text into numbers. There are  -several possible approaches to this conversion,   - -5 -00:00:30,080 --> 00:00:33,120 -and the objective is to find the  -most meaningful representation.   - -6 -00:00:36,000 --> 00:00:40,400 -We'll take a look at three distinct tokenization  -algorithms. We compare them one to one,   - -7 -00:00:40,400 --> 00:00:44,880 -so we recommend you look at the videos  -in the following order: Word-based,   - -8 -00:00:45,680 --> 00:00:55,680 -Character-based, and Subword-based. 
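As a tiny illustration of "translating text into numbers" before the three algorithms are compared, here is a sketch using an existing pretrained tokenizer; the checkpoint and input sentence are just examples.

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    tokens = tokenizer.tokenize("Machine learning models only work with numbers.")
    ids = tokenizer.convert_tokens_to_ids(tokens)
    print(tokens)  # the text split into tokens
    print(ids)     # each token mapped to an integer ID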
+1 +00:00:00,450 --> 00:00:01,509 +(intro whooshing) + +2 +00:00:01,509 --> 00:00:02,720 +(smiley snapping) + +3 +00:00:02,720 --> 00:00:03,930 +(words whooshing) + +4 +00:00:03,930 --> 00:00:04,920 +- In the next few videos, + +5 +00:00:04,920 --> 00:00:06,720 +we'll take a look at the tokenizers. + +6 +00:00:07,860 --> 00:00:09,240 +In natural language processing, + +7 +00:00:09,240 --> 00:00:12,930 +most of the data that we +handle consists of raw text. + +8 +00:00:12,930 --> 00:00:14,280 +However, machine learning models + +9 +00:00:14,280 --> 00:00:17,103 +cannot read or understand +text in its raw form, + +10 +00:00:18,540 --> 00:00:20,253 +they can only work with numbers. + +11 +00:00:21,360 --> 00:00:23,220 +So the tokenizer's objective + +12 +00:00:23,220 --> 00:00:25,923 +will be to translate +the text into numbers. + +13 +00:00:27,600 --> 00:00:30,240 +There are several possible +approaches to this conversion, + +14 +00:00:30,240 --> 00:00:31,110 +and the objective + +15 +00:00:31,110 --> 00:00:33,453 +is to find the most +meaningful representation. + +16 +00:00:36,240 --> 00:00:39,390 +We'll take a look at three +distinct tokenization algorithms. + +17 +00:00:39,390 --> 00:00:40,530 +We compare them one to one, + +18 +00:00:40,530 --> 00:00:42,600 +so we recommend you take +a look at the videos + +19 +00:00:42,600 --> 00:00:44,040 +in the following order. + +20 +00:00:44,040 --> 00:00:45,390 +First, "Word-based," + +21 +00:00:45,390 --> 00:00:46,800 +followed by "Character-based," + +22 +00:00:46,800 --> 00:00:48,877 +and finally, "Subword-based." + +23 +00:00:48,877 --> 00:00:51,794 +(outro whooshing) + diff --git a/subtitles/en/13_word-based-tokenizers.srt b/subtitles/en/13_word-based-tokenizers.srt index ffd34249d..a2908fd3b 100644 --- a/subtitles/en/13_word-based-tokenizers.srt +++ b/subtitles/en/13_word-based-tokenizers.srt @@ -1,128 +1,264 @@ -1 -00:00:03,120 --> 00:00:10,240 -Let's take a look at word-based tokenization.  -Word-based tokenization is the idea of splitting   - -2 -00:00:10,240 --> 00:00:19,040 -the raw text into words, by splitting on spaces  -or other specific rules like punctuation. In this   - -3 -00:00:19,040 --> 00:00:25,040 -algorithm, each word has a specific number, an  -"ID", attributed to it. In this example, "Let's"   - -4 -00:00:25,040 --> 00:00:33,120 -has the ID 250, do has ID 861, and tokenization  -followed by an exclamation point has the ID 345.   - -5 -00:00:34,160 --> 00:00:39,840 -This approach is interesting, as the model has  -representations that are based on entire words.   - -6 -00:00:42,560 --> 00:00:45,680 -The information held in a single number is high   - -7 -00:00:45,680 --> 00:00:52,880 -as a word contains a lot of contextual  -and semantic information in a sentence.   - -8 -00:00:52,880 --> 00:00:58,720 -However, this approach does have its limits.  -For example, the word dog and the word   - -9 -00:00:58,720 --> 00:01:04,320 -dogs are very similar, and their meaning is  -close. However, the word-based tokenization   - -10 -00:01:05,280 --> 00:01:10,320 -will attribute entirely different IDs to these  -two words, and the model will therefore learn   - -11 -00:01:10,320 --> 00:01:14,880 -different meanings for these two words. This  -is unfortunate, as we would like the model   - -12 -00:01:14,880 --> 00:01:21,120 -to understand that these words are indeed related  -and that dogs is the plural form of the word dog.   
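A toy sketch of the word-based approach described here; the vocabulary and the IDs for "dog" and "dogs" are made up for illustration, while the other IDs come from the example in the narration.

    # Split on whitespace, then map each word to an ID from a toy vocabulary.
    vocabulary = {"Let's": 250, "do": 861, "tokenization!": 345, "dog": 7, "dogs": 8}

    def word_tokenize(text):
        return [vocabulary[word] for word in text.split()]

    print(word_tokenize("Let's do tokenization!"))  # [250, 861, 345]
    # "dog" and "dogs" get unrelated IDs even though their meanings are close.
    print(word_tokenize("dog"), word_tokenize("dogs"))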
- -13 -00:01:22,800 --> 00:01:26,400 -Another issue with this approach is that there  -are a lot of different words in a language.   - -14 -00:01:27,840 --> 00:01:31,920 -If we want our model to understand all  -possible sentences in that language,   - -15 -00:01:31,920 --> 00:01:37,200 -then we will need to have an ID for each  -different word, and the total number of words,   - -16 -00:01:37,200 --> 00:01:41,440 -which is also known as the vocabulary  -size, can quickly become very large.   - -17 -00:01:44,160 --> 00:01:48,800 -This is an issue because each ID is mapped to a  -large vector that represents the word's meaning,   - -18 -00:01:50,000 --> 00:01:55,840 -and keeping track of these mappings requires an  -enormous number of weights when the vocabulary   - -19 -00:01:55,840 --> 00:02:03,360 -size is large. If we want our models to stay  -lean, we can opt for our tokenizer to ignore   - -20 -00:02:03,360 --> 00:02:11,760 -certain words that we don't necessarily need. For  -example, when training our tokenizer on a text,   - -21 -00:02:11,760 --> 00:02:15,680 -we might want to take the 10,000  -most frequent words in that text   - -22 -00:02:20,640 --> 00:02:23,520 -to create our basic vocabulary, instead  -of taking all of that language's words.   - -23 -00:02:23,520 --> 00:02:27,200 -The tokenizer will know how to convert  -those 10,000 words into numbers,   - -24 -00:02:27,200 --> 00:02:33,520 -but any other word will be converted to the  -out-of-vocabulary word, or the "unknown" word.   - -25 -00:02:36,000 --> 00:02:39,760 -This can rapidly become an issue: the model  -will have the exact same representation   - -26 -00:02:39,760 --> 00:02:44,720 -for all words that it doesn't know, which  -will result in a lot of lost information. +1 +00:00:00,165 --> 00:00:01,416 +(screen whooshing) + +2 +00:00:01,416 --> 00:00:02,716 +(sticker popping) + +3 +00:00:02,716 --> 00:00:03,549 +(screen whooshing) + +4 +00:00:03,549 --> 00:00:05,603 +- Let's take a look at +word-based tokenization. + +5 +00:00:07,650 --> 00:00:09,780 +Word-based tokenization is the idea + +6 +00:00:09,780 --> 00:00:11,940 +of splitting the raw text into words + +7 +00:00:11,940 --> 00:00:14,673 +by splitting on spaces +or other specific rules, + +8 +00:00:16,020 --> 00:00:17,163 +like punctuation. + +9 +00:00:18,900 --> 00:00:21,810 +In this algorithm, each +word has a specific number + +10 +00:00:21,810 --> 00:00:23,463 +or ID attributed to it. + +11 +00:00:24,360 --> 00:00:27,270 +Here, let's has the ID 250, + +12 +00:00:27,270 --> 00:00:30,150 +do has 861, and tokenization + +13 +00:00:30,150 --> 00:00:33,393 +followed by an exclamation mark has 345. + +14 +00:00:34,380 --> 00:00:36,000 +This approach is interesting + +15 +00:00:36,000 --> 00:00:38,100 +as the model has representations + +16 +00:00:38,100 --> 00:00:40,233 +that are based on entire words. + +17 +00:00:42,720 --> 00:00:45,960 +The information held in +a single number is high, + +18 +00:00:45,960 --> 00:00:48,240 +as a word contains a lot of contextual + +19 +00:00:48,240 --> 00:00:49,803 +and semantic information. + +20 +00:00:53,070 --> 00:00:55,473 +However, this approach +does have its limits. + +21 +00:00:56,610 --> 00:01:00,570 +For example, the word dog and +the word dogs are very similar + +22 +00:01:00,570 --> 00:01:01,923 +and their meaning is close. 
+ +23 +00:01:03,210 --> 00:01:05,550 +The word-based tokenization, however, + +24 +00:01:05,550 --> 00:01:08,520 +will attribute entirely +different IDs to these two words + +25 +00:01:08,520 --> 00:01:10,110 +and the model will therefore learn + +26 +00:01:10,110 --> 00:01:12,930 +two different embeddings +for these two words. + +27 +00:01:12,930 --> 00:01:15,090 +This is unfortunate as +we would like the model + +28 +00:01:15,090 --> 00:01:18,240 +to understand that these +words are indeed related, + +29 +00:01:18,240 --> 00:01:21,483 +and that dogs is simply the +plural form of the word dog. + +30 +00:01:22,980 --> 00:01:24,480 +Another issue with this approach, + +31 +00:01:24,480 --> 00:01:28,050 +is that there are a lot of +different words in the language. + +32 +00:01:28,050 --> 00:01:29,490 +If we want our model to understand + +33 +00:01:29,490 --> 00:01:32,160 +all possible sentences in that language, + +34 +00:01:32,160 --> 00:01:35,850 +then we will need to have an +ID for each different word. + +35 +00:01:35,850 --> 00:01:37,380 +And the total number of words, + +36 +00:01:37,380 --> 00:01:40,080 +which is also known as +the vocabulary size, + +37 +00:01:40,080 --> 00:01:41,913 +can quickly become very large. + +38 +00:01:44,400 --> 00:01:47,640 +This is an issue because each +ID is mapped to a large vector + +39 +00:01:47,640 --> 00:01:50,190 +that represents the word's meaning, + +40 +00:01:50,190 --> 00:01:52,170 +and keeping track of these mappings + +41 +00:01:52,170 --> 00:01:54,990 +requires an enormous number of weights + +42 +00:01:54,990 --> 00:01:57,123 +when the vocabulary size is very large. + +43 +00:01:59,160 --> 00:02:00,960 +If we want our models to stay lean, + +44 +00:02:00,960 --> 00:02:04,440 +we can opt for our tokenizer +to ignore certain words + +45 +00:02:04,440 --> 00:02:06,093 +that we don't necessarily need. + +46 +00:02:08,400 --> 00:02:11,970 +For example, here, when training +our tokenizer on a text, + +47 +00:02:11,970 --> 00:02:15,020 +we might want to take only +the 10,000 most frequent words + +48 +00:02:15,020 --> 00:02:16,320 +in that text. + +49 +00:02:16,320 --> 00:02:18,600 +Rather than taking all +words from in that text + +50 +00:02:18,600 --> 00:02:22,503 +or all languages words to +create our basic vocabulary. + +51 +00:02:23,790 --> 00:02:26,520 +The tokenizer will know how +to convert those 10,000 words + +52 +00:02:26,520 --> 00:02:29,370 +into numbers, but any other +word will be converted + +53 +00:02:29,370 --> 00:02:31,530 +to the out-of-vocabulary word, + +54 +00:02:31,530 --> 00:02:33,783 +or like shown here, the unknown word. + +55 +00:02:35,280 --> 00:02:37,440 +Unfortunately, this is a compromise. + +56 +00:02:37,440 --> 00:02:39,900 +The model will have the +exact same representation + +57 +00:02:39,900 --> 00:02:42,390 +for all words that it doesn't know, + +58 +00:02:42,390 --> 00:02:45,210 +which can result in a +lot of lost information + +59 +00:02:45,210 --> 00:02:47,664 +if many unknown words are present. 
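A sketch of the vocabulary-limiting idea described here: keep only the most frequent words and send everything else to an unknown token. The corpus and the vocabulary size are placeholders standing in for a real corpus and the 10,000-word limit.

    from collections import Counter

    corpus_words = "the cat sat on the mat the cat slept".split()  # placeholder corpus
    vocab_size = 4                                                 # stand-in for 10,000

    # Keep the most frequent words; anything else is mapped to [UNK].
    most_common = [word for word, _ in Counter(corpus_words).most_common(vocab_size)]
    word_to_id = {word: i for i, word in enumerate(most_common)}
    unk_id = len(word_to_id)

    def encode(text):
        return [word_to_id.get(word, unk_id) for word in text.split()]

    print(encode("the cat sat on the sofa"))  # "sofa" falls back to the unknown ID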
+ +60 +00:02:47,664 --> 00:02:50,581 +(screen whooshing) + diff --git a/subtitles/en/14_character-based-tokenizers.srt b/subtitles/en/14_character-based-tokenizers.srt index 1b3fcd616..c86407bd6 100644 --- a/subtitles/en/14_character-based-tokenizers.srt +++ b/subtitles/en/14_character-based-tokenizers.srt @@ -1,134 +1,278 @@ -1 -00:00:04,160 --> 00:00:09,440 -Before diving in character-based tokenization,  -understanding why this kind of tokenization   - -2 -00:00:09,440 --> 00:00:13,680 -is interesting requires understanding  -the flaws of word-based tokenization.   - -3 -00:00:14,560 --> 00:00:18,400 -If you haven't seen the first video on  -word-based tokenization we recommend you   - -4 -00:00:18,400 --> 00:00:23,920 -check it out before looking at this video. Let's  -take a look at character-based tokenization.   - -5 -00:00:25,440 --> 00:00:29,840 -We now split our text into individual  -characters, rather than words.   - -6 -00:00:32,720 --> 00:00:37,200 -There are generally a lot of different words in  -languages, while the number of characters stays   - -7 -00:00:37,200 --> 00:00:45,520 -low. Here for example, for the English language  -that has an estimated 170,000 different words,   - -8 -00:00:45,520 --> 00:00:48,960 -we would need a very large  -vocabulary to encompass all words.   - -9 -00:00:50,080 --> 00:00:55,040 -With a character-based vocabulary, we  -can get by with only 256 characters!   - -10 -00:00:59,600 --> 00:01:04,880 -Even languages with a lot of different characters  -like the Chinese languages have dictionaries with   - -11 -00:01:06,160 --> 00:01:14,160 -~20,000 different characters but more than 375,000  -different words. Character-based vocabularies   - -12 -00:01:14,160 --> 00:01:20,240 -let us fewer different tokens than the word-based  -tokenization dictionaries we would otherwise use.   - -13 -00:01:23,040 --> 00:01:28,000 -These vocabularies are also more complete than  -their word-based vocabularies counterparts.   - -14 -00:01:28,720 --> 00:01:34,160 -As our vocabulary contains all characters used  -in a language, even words unseen during the   - -15 -00:01:34,160 --> 00:01:39,840 -tokenizer training can still be tokenized, so  -out-of-vocabulary tokens will be less frequent.   - -16 -00:01:40,480 --> 00:01:45,200 -This includes the ability to correctly tokenize  -misspelled words, rather than discarding them as   - -17 -00:01:45,200 --> 00:01:53,600 -unknown straight away. However, this algorithm  -isn't perfect either! Intuitively, characters   - -18 -00:01:53,600 --> 00:01:59,760 -do not hold as much information individually as  -a word would hold. For example, "Let's" holds   - -19 -00:01:59,760 --> 00:02:07,040 -more information than "l". Of course, this is not  -true for all languages, as some languages like   - -20 -00:02:07,040 --> 00:02:11,280 -ideogram-based languages have a lot of  -information held in single characters,   - -21 -00:02:12,480 --> 00:02:17,200 -but for others like roman-based languages,  -the model will have to make sense of multiple   - -22 -00:02:17,200 --> 00:02:25,120 -tokens at a time to get the information held in  -a single word. This leads to another issue with   - -23 -00:02:25,120 --> 00:02:30,320 -character-based tokenizers: their sequences are  -translated into very large amount of tokens to be   - -24 -00:02:30,320 --> 00:02:37,680 -processed by the model. 
This can have an impact  -on the size of the context the model will carry   - -25 -00:02:37,680 --> 00:02:45,120 -around, and will reduce the size of the text we  -can use as input for our model. This tokenization,   - -26 -00:02:45,120 --> 00:02:49,920 -while it has some issues, has seen some very good  -results in the past and should be considered when   - -27 -00:02:49,920 --> 00:03:00,720 -approaching a new problem as it solves some  -issues encountered in the word-based algorithm. +1 +00:00:00,234 --> 00:00:02,901 +(page whirring) + +2 +00:00:04,260 --> 00:00:07,200 +- Before diving in +character-based tokenization, + +3 +00:00:07,200 --> 00:00:10,350 +understanding why this kind +of tokenization is interesting + +4 +00:00:10,350 --> 00:00:13,533 +requires understanding the flaws +of word-based tokenization. + +5 +00:00:14,640 --> 00:00:16,320 +If you haven't seen the first video + +6 +00:00:16,320 --> 00:00:17,880 +on word-based tokenization + +7 +00:00:17,880 --> 00:00:21,450 +we recommend you check it out +before looking at this video. + +8 +00:00:21,450 --> 00:00:24,250 +Okay, let's take a look at +character-based tokenization. + +9 +00:00:25,650 --> 00:00:28,560 +We now split our text into +individual characters, + +10 +00:00:28,560 --> 00:00:29,673 +rather than words. + +11 +00:00:32,850 --> 00:00:35,550 +There are generally a lot of +different words in languages, + +12 +00:00:35,550 --> 00:00:37,743 +while the number of characters stays low. + +13 +00:00:38,610 --> 00:00:41,313 +To begin let's take a look +at the English language, + +14 +00:00:42,210 --> 00:00:45,540 +it has an estimated +170,000 different words, + +15 +00:00:45,540 --> 00:00:47,730 +so we would need a very large vocabulary + +16 +00:00:47,730 --> 00:00:49,413 +to encompass all words. + +17 +00:00:50,280 --> 00:00:52,200 +With a character-based vocabulary, + +18 +00:00:52,200 --> 00:00:55,440 +we can get by with only 256 characters, + +19 +00:00:55,440 --> 00:00:58,683 +which includes letters, +numbers and special characters. + +20 +00:00:59,760 --> 00:01:02,190 +Even languages with a lot +of different characters + +21 +00:01:02,190 --> 00:01:04,800 +like the Chinese languages +can have dictionaries + +22 +00:01:04,800 --> 00:01:08,130 +with up to 20,000 different characters + +23 +00:01:08,130 --> 00:01:11,523 +but more than 375,000 different words. + +24 +00:01:12,480 --> 00:01:14,310 +So character-based vocabularies + +25 +00:01:14,310 --> 00:01:16,293 +let us use fewer different tokens + +26 +00:01:16,293 --> 00:01:19,050 +than the word-based +tokenization dictionaries + +27 +00:01:19,050 --> 00:01:20,523 +we would otherwise use. + +28 +00:01:23,250 --> 00:01:25,830 +These vocabularies are also more complete + +29 +00:01:25,830 --> 00:01:28,950 +than their word-based +vocabularies counterparts. + +30 +00:01:28,950 --> 00:01:31,410 +As our vocabulary contains all characters + +31 +00:01:31,410 --> 00:01:33,960 +used in a language, even words unseen + +32 +00:01:33,960 --> 00:01:36,990 +during the tokenizer training +can still be tokenized, + +33 +00:01:36,990 --> 00:01:39,633 +so out-of-vocabulary tokens +will be less frequent. + +34 +00:01:40,680 --> 00:01:42,840 +This includes the ability +to correctly tokenize + +35 +00:01:42,840 --> 00:01:45,210 +misspelled words, rather +than discarding them + +36 +00:01:45,210 --> 00:01:46,623 +as unknown straight away. + +37 +00:01:48,240 --> 00:01:52,380 +However, this algorithm +isn't perfect either. 
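As a rough sketch of the splitting just described, the whole character-level vocabulary can be derived from the characters of a text; the choice of printable ASCII characters here is only an example.

from string import printable

# A character-level vocabulary stays tiny: here, the printable ASCII characters.
char_to_id = {ch: i for i, ch in enumerate(printable)}

text = "Let's tokenize!"
ids = [char_to_id[ch] for ch in text]

print(len(char_to_id), "characters in the vocabulary")
print(ids)  # one token per character, so sequences grow quickly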
+ +38 +00:01:52,380 --> 00:01:54,360 +Intuitively, characters do not hold + +39 +00:01:54,360 --> 00:01:57,990 +as much information individually +as a word would hold. + +40 +00:01:57,990 --> 00:02:00,930 +For example, "Let's" +holds more information + +41 +00:02:00,930 --> 00:02:03,570 +than it's first letter "l". + +42 +00:02:03,570 --> 00:02:05,880 +Of course, this is not +true for all languages, + +43 +00:02:05,880 --> 00:02:08,880 +as some languages like +ideogram-based languages + +44 +00:02:08,880 --> 00:02:11,523 +have a lot of information +held in single characters, + +45 +00:02:12,750 --> 00:02:15,360 +but for others like roman-based languages, + +46 +00:02:15,360 --> 00:02:17,760 +the model will have to make +sense of multiple tokens + +47 +00:02:17,760 --> 00:02:20,670 +at a time to get the +information otherwise held + +48 +00:02:20,670 --> 00:02:21,753 +in a single word. + +49 +00:02:23,760 --> 00:02:27,000 +This leads to another issue +with character-based tokenizers, + +50 +00:02:27,000 --> 00:02:29,520 +their sequences are translated +into very large amount + +51 +00:02:29,520 --> 00:02:31,593 +of tokens to be processed by the model. + +52 +00:02:33,090 --> 00:02:36,810 +And this can have an impact +on the size of the context + +53 +00:02:36,810 --> 00:02:40,020 +the model will carry around, +and will reduce the size + +54 +00:02:40,020 --> 00:02:42,030 +of the text we can use +as input for our model, + +55 +00:02:42,030 --> 00:02:43,233 +which is often limited. + +56 +00:02:44,100 --> 00:02:46,650 +This tokenization, while +it has some issues, + +57 +00:02:46,650 --> 00:02:48,720 +has seen some very good +results in the past + +58 +00:02:48,720 --> 00:02:50,490 +and so it should be +considered when approaching + +59 +00:02:50,490 --> 00:02:52,680 +a new problem as it solves issues + +60 +00:02:52,680 --> 00:02:54,843 +encountered in the word-based algorithm. + +61 +00:02:56,107 --> 00:02:58,774 +(page whirring) + diff --git a/subtitles/en/15_subword-based-tokenizers.srt b/subtitles/en/15_subword-based-tokenizers.srt index 49a3d6d2d..6c4ef7ff3 100644 --- a/subtitles/en/15_subword-based-tokenizers.srt +++ b/subtitles/en/15_subword-based-tokenizers.srt @@ -1,159 +1,323 @@ -1 -00:00:06,320 --> 00:00:11,440 -Let's take a look at subword-based tokenization.  -Understanding why subword-based tokenization   - -2 -00:00:11,440 --> 00:00:16,320 -is interesting requires understanding the flaws  -of word-based and character-based tokenization.   - -3 -00:00:17,200 --> 00:00:21,760 -If you haven't seen the first videos on  -word-based and character-based tokenization,   - -4 -00:00:21,760 --> 00:00:24,400 -we recommend you check them out  -before looking at this video.   - -5 -00:00:27,680 --> 00:00:33,440 -Subword-tokenization lies in between  -character-based and word-based tokenization   - -6 -00:00:33,440 --> 00:00:40,960 -algorithms. The idea is to find a middle ground  -between very large vocabularies, large quantity of   - -7 -00:00:40,960 --> 00:00:47,040 -out-of-vocabulary tokens, loss of meaning across  -very similar words, for word-based tokenizers,   - -8 -00:00:47,040 --> 00:00:52,800 -and very long sequences, less meaningful  -individual tokens for character-based tokenizers.   - -9 -00:00:54,720 --> 00:00:59,360 -These algorithms rely on the following  -principle: frequently used words should not   - -10 -00:00:59,360 --> 00:01:04,800 -be split into smaller subwords, but rare words  -should be decomposed into meaningful subwords.   
- -11 -00:01:06,320 --> 00:01:11,520 -An example is the word dog: we would like to have  -our tokenizer to have a single ID for the word   - -12 -00:01:11,520 --> 00:01:18,480 -dog, rather than splitting it into characters:  -d, o, and g. However, when encountering the word   - -13 -00:01:18,480 --> 00:01:23,920 -dogs, we would like our tokenizer to understand  -that at the root, this is still the word dog,   - -14 -00:01:23,920 --> 00:01:31,280 -with an added s while slightly changes the meaning  -while keeping the original idea. Another example   - -15 -00:01:31,280 --> 00:01:37,520 -is a complex word like tokenization, which can  -be split into meaningful subwords. The root of   - -16 -00:01:37,520 --> 00:01:42,000 -the word is token, and ization completes the  -root to give it a slightly different meaning.   - -17 -00:01:42,720 --> 00:01:48,960 -It makes sense to split the word into two: token,  -as the root of the word (labeled as the "start" of   - -18 -00:01:48,960 --> 00:01:53,840 -the word). ization as additional information  -(labeled as a "completion" of the word).   - -19 -00:01:56,240 --> 00:02:00,320 -In turn, the model will now be able to make  -sense of token in different situations.   - -20 -00:02:00,880 --> 00:02:06,400 -It will understand that the words token, tokens,  -tokenizing, and tokenization are linked and have   - -21 -00:02:06,400 --> 00:02:14,000 -a similar meaning. It will also understand that  -tokenization, modernization, and immunization,   - -22 -00:02:14,000 --> 00:02:18,960 -which all have the same suffixes, are probably  -used in the same syntactic situations.   - -23 -00:02:20,320 --> 00:02:25,920 -Subword-based tokenizers generally have a way  -to identify which tokens are start of words, and   - -24 -00:02:25,920 --> 00:02:34,320 -which tokens complete start of words: token as the  -start of a word. ##ization as completing a word.   - -25 -00:02:34,960 --> 00:02:40,800 -Here the ## prefix indicates that ization is  -part of a word rather than the beginning of it.   - -26 -00:02:41,760 --> 00:02:49,440 -The ## comes from the BERT tokenizer, based on the  -WordPiece algorithm. Other tokenizers use other   - -27 -00:02:49,440 --> 00:02:54,720 -prefixes, which can be placed to indicate part of  -words like seen here, or start of words instead!   - -28 -00:02:56,000 --> 00:03:01,040 -There are a lot of different algorithms that can  -be used for subword tokenization, and most models   - -29 -00:03:01,040 --> 00:03:05,760 -obtaining state-of-the-art results in English  -today use some kind of subword-tokenization   - -30 -00:03:05,760 --> 00:03:12,320 -algorithm. These approaches help in reducing  -the vocabulary sizes by sharing information   - -31 -00:03:12,320 --> 00:03:17,840 -across different words, having the ability to  -have prefixes and suffixes understood as such.   - -32 -00:03:18,480 --> 00:03:27,760 -They keep meaning across very similar words,  -by recognizing similar tokens making them up. +1 +00:00:06,450 --> 00:00:09,540 +- Let's take a look at +subword based tokenization. + +2 +00:00:09,540 --> 00:00:11,610 +Understanding why subword +based tokenization is + +3 +00:00:11,610 --> 00:00:13,980 +interesting requires +understanding the flaws + +4 +00:00:13,980 --> 00:00:17,340 +of word based and corrector +based tokenization. 
+

5
00:00:17,340 --> 00:00:18,780
If you haven't seen the first videos

6
00:00:18,780 --> 00:00:22,020
on word based and character
based tokenization

7
00:00:22,020 --> 00:00:23,130
we recommend you check them

8
00:00:23,130 --> 00:00:24,780
out before looking at this video.

9
00:00:27,840 --> 00:00:31,493
Subword based tokenization
lies in between character based

10
00:00:31,493 --> 00:00:35,280
and word based tokenization algorithms.

11
00:00:35,280 --> 00:00:37,410
The idea is to find a middle ground

12
00:00:37,410 --> 00:00:39,486
between very large vocabularies

13
00:00:39,486 --> 00:00:42,600
a large quantity of out-of-vocabulary tokens

14
00:00:42,600 --> 00:00:45,360
and a loss of meaning
across very similar words

15
00:00:45,360 --> 00:00:48,630
for word based tokenizers
and very long sequences

16
00:00:48,630 --> 00:00:51,330
as well as less meaningful
individual tokens

17
00:00:51,330 --> 00:00:53,133
for character based tokenizers.

18
00:00:54,840 --> 00:00:57,960
These algorithms rely on
the following principle.

19
00:00:57,960 --> 00:01:00,000
Frequently used words should not be split

20
00:01:00,000 --> 00:01:01,500
into smaller subwords

21
00:01:01,500 --> 00:01:03,433
while rare words should be decomposed

22
00:01:03,433 --> 00:01:05,103
into meaningful subwords.

23
00:01:06,510 --> 00:01:08,460
An example is the word dog.

24
00:01:08,460 --> 00:01:11,190
We would like our
tokenizer to have a single ID

25
00:01:11,190 --> 00:01:12,600
for the word dog rather

26
00:01:12,600 --> 00:01:15,363
than splitting it into
characters D, O and G.

27
00:01:16,650 --> 00:01:19,260
However, when encountering the word dogs

28
00:01:19,260 --> 00:01:22,710
we would like our tokenizer to
understand that at the root

29
00:01:22,710 --> 00:01:24,120
this is still the word dog.

30
00:01:24,120 --> 00:01:27,030
With an added S, that
slightly changes the meaning

31
00:01:27,030 --> 00:01:28,923
while keeping the original idea.

32
00:01:30,600 --> 00:01:34,080
Another example is a complex
word like tokenization

33
00:01:34,080 --> 00:01:37,140
which can be split into
meaningful subwords.

34
00:01:37,140 --> 00:01:37,973
The root

35
00:01:37,973 --> 00:01:40,590
of the word is token and
-ization completes the root

36
00:01:40,590 --> 00:01:42,870
to give it a slightly different meaning.

37
00:01:42,870 --> 00:01:44,430
It makes sense to split the word

38
00:01:44,430 --> 00:01:47,640
into two, token as the root of the word,

39
00:01:47,640 --> 00:01:49,950
labeled as the start of the word

40
00:01:49,950 --> 00:01:52,530
and ization as additional
information labeled

41
00:01:52,530 --> 00:01:54,393
as a completion of the word.

42
00:01:55,826 --> 00:01:58,740
In turn, the model will
now be able to make sense

43
00:01:58,740 --> 00:02:01,080
of token in different situations.

44
00:02:01,080 --> 00:02:04,602
It will understand that the
words token, tokens, tokenizing

45
00:02:04,602 --> 00:02:08,760
and tokenization have a
similar meaning and are linked.

46
00:02:08,760 --> 00:02:12,450
It will also understand that
tokenization, modernization

47
00:02:12,450 --> 00:02:16,200
and immunization, which
all have the same suffixes

48
00:02:16,200 --> 00:02:19,383
are probably used in the
same syntactic situations.
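An easy way to see these subword splits in practice is to load a pretrained tokenizer and inspect its output; this sketch assumes the transformers library is installed and the bert-base-uncased checkpoint can be downloaded.

from transformers import AutoTokenizer

# BERT's WordPiece tokenizer keeps frequent words whole and splits rarer ones.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

print(tokenizer.tokenize("dog"))           # a frequent word stays a single token
print(tokenizer.tokenize("tokenization"))  # split into subwords, e.g. ['token', '##ization']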
+

49
00:02:20,610 --> 00:02:23,130
Subword based tokenizers
generally have a way to

50
00:02:23,130 --> 00:02:25,890
identify which tokens are a start of word

51
00:02:25,890 --> 00:02:28,443
and which tokens complete start of words.

52
00:02:29,520 --> 00:02:31,140
So here token as the start

53
00:02:31,140 --> 00:02:35,100
of a word and ##ization
as the completion of a word.

54
00:02:35,100 --> 00:02:38,103
Here, the ## prefix
indicates that ization is part

55
00:02:38,103 --> 00:02:41,013
of a word rather than the beginning of it.

56
00:02:41,910 --> 00:02:43,110
The ## prefix comes

57
00:02:43,110 --> 00:02:47,013
from the BERT tokenizer, based
on the WordPiece algorithm.

58
00:02:47,850 --> 00:02:50,700
Other tokenizers use other
prefixes which can be

59
00:02:50,700 --> 00:02:52,200
placed to indicate part of words

60
00:02:52,200 --> 00:02:55,083
like shown here, or start of words instead.

61
00:02:56,250 --> 00:02:57,083
There are a lot

62
00:02:57,083 --> 00:02:58,740
of different algorithms that can be used

63
00:02:58,740 --> 00:03:00,090
for subword tokenization

64
00:03:00,090 --> 00:03:02,670
and most models obtaining
state-of-the-art results

65
00:03:02,670 --> 00:03:03,780
in English today

66
00:03:03,780 --> 00:03:06,663
use some kind of subword
tokenization algorithm.

67
00:03:07,620 --> 00:03:10,953
These approaches help in
reducing the vocabulary sizes

68
00:03:10,953 --> 00:03:13,636
by sharing information
across different words

69
00:03:13,636 --> 00:03:15,960
having the ability to have prefixes

70
00:03:15,960 --> 00:03:18,630
and suffixes understood as such.

71
00:03:18,630 --> 00:03:20,700
They keep meaning across
very similar words

72
00:03:20,700 --> 00:03:23,103
by recognizing similar
tokens making them up.

diff --git a/subtitles/en/16_the-tokenization-pipeline.srt b/subtitles/en/16_the-tokenization-pipeline.srt
index 537af57bb..f0da01106 100644
--- a/subtitles/en/16_the-tokenization-pipeline.srt
+++ b/subtitles/en/16_the-tokenization-pipeline.srt
@@ -1,168 +1,339 @@
-1
-00:00:05,440 --> 00:00:12,320
-The tokenizer pipeline. In this video, we'll look  
-at how a tokenizer converts raw text to numbers   

-2
-00:00:12,320 --> 00:00:18,080
-that a Transformer model can make sense of,  
-like when we execute this code. Here is a quick   

-3
-00:00:18,080 --> 00:00:24,400
-overview of what happens inside the tokenizer  
-object: first the text is split into tokens, which   

-4
-00:00:24,400 --> 00:00:31,280
-are words, parts of words, or punctuation symbols.  
-Then the tokenizer adds potential special tokens   

-5
-00:00:31,280 --> 00:00:36,560
-and converts each token to their unique respective  
-ID as defined by the tokenizer's vocabulary.   

-6
-00:00:37,520 --> 00:00:41,440
-As we'll see it doesn't actually happen  
-in this order, but viewing it like this   

-7
-00:00:41,440 --> 00:00:46,320
-is better for understanding what happens.  
-The first step is to split our input text   

-8
-00:00:46,320 --> 00:00:53,840
-into tokens with the tokenize method. To do this,  
-the tokenizer may first perform some operations   

-9
-00:00:53,840 --> 00:00:58,000
-like lowercasing all words, then follow a  
-set of rules to split the result in small   

-10
-00:00:58,000 --> 00:01:03,520
-chunks of text. 
Most of the Transformers  -models use a subword tokenization algorithm,   - -11 -00:01:04,160 --> 00:01:08,720 -which means that one given word can be  -split in several tokens, like tokenize   - -12 -00:01:08,720 --> 00:01:13,360 -here. Look at the "Tokenization algorithms"  -videos linked below for more information!   - -13 -00:01:14,480 --> 00:01:19,600 -The ## prefix we see in front of ize is  -the convention used by BERT to indicate   - -14 -00:01:19,600 --> 00:01:26,080 -this token is not the beginning of a word. Other  -tokenizers may use different conventions however:   - -15 -00:01:26,080 --> 00:01:31,040 -for instance ALBERT tokenizers will add  -a long underscore in front of all the   - -16 -00:01:31,040 --> 00:01:36,640 -tokens that had a space before them, which is  -a convention used by sentencepiece tokenizers.   - -17 -00:01:38,320 --> 00:01:43,280 -The second step of the tokenization pipeline  -is to map those tokens to their respective IDs   - -18 -00:01:43,280 --> 00:01:48,960 -as defined by the vocabulary of the tokenizer.  -This is why we need to download a file when we   - -19 -00:01:48,960 --> 00:01:53,600 -instantiate a tokenizer with the from_pretrained  -method: we have to make sure we use the same   - -20 -00:01:53,600 --> 00:01:59,520 -mapping as when the model was pretrained. To do  -this, we use the convert_tokens_to_ids method.   - -21 -00:02:00,720 --> 00:02:05,360 -You may have noticed that we don't have the  -exact same result as in our first slide — or not,   - -22 -00:02:05,360 --> 00:02:09,840 -as this looks like a list of random numbers,  -in which case allow me to refresh your memory.   - -23 -00:02:10,480 --> 00:02:13,680 -We had a number at the beginning  -and at the end that are missing,   - -24 -00:02:14,400 --> 00:02:20,160 -those are the special tokens. The special tokens  -are added by the prepare_for_model method,   - -25 -00:02:20,160 --> 00:02:25,280 -which knows the indices of those tokens in the  -vocabulary and just adds the proper numbers.   - -26 -00:02:28,320 --> 00:02:32,480 -You can look at the special tokens (and more  -generally at how the tokenizer has changed   - -27 -00:02:32,480 --> 00:02:37,120 -your text) by using the decode method  -on the outputs of the tokenizer object.   - -28 -00:02:38,240 --> 00:02:44,080 -As for the prefix for beginning of words/part  -of words, those special tokens vary depending on   - -29 -00:02:44,080 --> 00:02:50,080 -which tokenizer you are using. The BERT tokenizer  -uses [CLS] and [SEP] but the roberta tokenizer   - -30 -00:02:50,080 --> 00:02:57,520 -uses html-like anchors and . Now that  -you know how the tokenizer works, you can forget   - -31 -00:02:57,520 --> 00:03:02,560 -all those intermediaries methods and only remember  -that you just have to call it on your input texts.   - -32 -00:03:03,600 --> 00:03:06,880 -The inputs don't contain the inputs IDs however,   - -33 -00:03:07,520 --> 00:03:11,600 -to learn what the attention mask is, check  -out the "Batch inputs together" video.   - -34 -00:03:12,160 --> 00:03:17,840 -To learn about token type IDs, look at  -the "Process pairs of sentences" video. +1 +00:00:00,479 --> 00:00:03,396 +(object whooshing) + +2 +00:00:05,610 --> 00:00:06,873 +- The tokenizer pipeline. 
+

3
00:00:07,920 --> 00:00:10,570
In this video, we'll look
at how a tokenizer converts

4
00:00:11,433 --> 00:00:12,480
raw texts to numbers,

5
00:00:12,480 --> 00:00:14,970
that a Transformer
model can make sense of,

6
00:00:14,970 --> 00:00:16,520
like when we execute this code.

7
00:00:17,760 --> 00:00:18,690
Here is a quick overview

8
00:00:18,690 --> 00:00:21,630
of what happens inside
the tokenizer object:

9
00:00:21,630 --> 00:00:24,360
first, the text is split into tokens,

10
00:00:24,360 --> 00:00:27,453
which are words, parts of
words, or punctuation symbols.

11
00:00:28,440 --> 00:00:31,500
Then the tokenizer adds
potential special tokens

12
00:00:31,500 --> 00:00:34,680
and converts each token to
their unique respective ID

13
00:00:34,680 --> 00:00:36,843
as defined by the tokenizer's vocabulary.

14
00:00:37,710 --> 00:00:40,380
As we'll see, it doesn't
quite happen in this order,

15
00:00:40,380 --> 00:00:43,233
but doing it like this is
better for understanding.

16
00:00:44,280 --> 00:00:47,670
The first step is to split
our input text into tokens.

17
00:00:47,670 --> 00:00:49,653
We use the tokenize method for this.

18
00:00:50,550 --> 00:00:54,030
To do that, the tokenizer may
first perform some operations,

19
00:00:54,030 --> 00:00:56,880
like lowercasing all words,
then follow a set of rules

20
00:00:56,880 --> 00:00:59,283
to split the result in
small chunks of text.

21
00:01:00,480 --> 00:01:02,286
Most of the Transformer models use

22
00:01:02,286 --> 00:01:04,890
a subword tokenization algorithm, which means

23
00:01:04,890 --> 00:01:06,750
that one given word can be split

24
00:01:06,750 --> 00:01:10,050
in several tokens like tokenize here.

25
00:01:10,050 --> 00:01:12,570
Look at the "Tokenization
algorithms" video link below

26
00:01:12,570 --> 00:01:13,743
for more information.

27
00:01:14,760 --> 00:01:17,820
The ## prefix we see in front of ize is

28
00:01:17,820 --> 00:01:19,830
a convention used by BERT to indicate

29
00:01:19,830 --> 00:01:22,762
this token is not the
beginning of the word.

30
00:01:22,762 --> 00:01:26,310
Other tokenizers may use
different conventions however:

31
00:01:26,310 --> 00:01:29,984
for instance, ALBERT tokenizers
will add a long underscore

32
00:01:29,984 --> 00:01:31,620
in front of all the tokens

33
00:01:31,620 --> 00:01:34,920
that had a space before them,
which is a convention shared

34
00:01:34,920 --> 00:01:37,700
by all sentencepiece tokenizers.

35
00:01:38,580 --> 00:01:41,040
The second step of the
tokenization pipeline is

36
00:01:41,040 --> 00:01:43,470
to map those tokens to
their respective IDs

37
00:01:43,470 --> 00:01:45,770
as defined by the
vocabulary of the tokenizer.

38
00:01:46,770 --> 00:01:48,690
This is why we need to download the file

39
00:01:48,690 --> 00:01:50,580
when we instantiate a tokenizer

40
00:01:50,580 --> 00:01:52,400
with the from_pretrained method.

41
00:01:52,400 --> 00:01:54,390
We have to make sure
we use the same mapping

42
00:01:54,390 --> 00:01:56,520
as when the model was pretrained.

43
00:01:56,520 --> 00:01:59,703
To do this, we use the
convert_tokens_to_ids method.
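Here is a sketch of those steps run by hand, including the special tokens and decoding discussed next; the checkpoint is only an example, and in everyday use calling the tokenizer directly on the text does all of this for you.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text = "Let's try to tokenize!"

tokens = tokenizer.tokenize(text)                      # step 1: split into tokens
input_ids = tokenizer.convert_tokens_to_ids(tokens)    # step 2: map tokens to vocabulary IDs
final_inputs = tokenizer.prepare_for_model(input_ids)  # add the special tokens

print(tokens)
print(input_ids)
print(tokenizer.decode(final_inputs["input_ids"]))     # shows the special tokens added around the text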
+

44
00:02:01,050 --> 00:02:01,883
We may have noticed

45
00:02:01,883 --> 00:02:03,540
that we don't have the exact same results

46
00:02:03,540 --> 00:02:05,580
as in our first slide, or not

47
00:02:05,580 --> 00:02:07,920
as this looks like a list
of random numbers anyway,

48
00:02:07,920 --> 00:02:10,680
in which case, allow me
to refresh your memory.

49
00:02:10,680 --> 00:02:12,350
We had a number at
the beginning and a number

50
00:02:12,350 --> 00:02:17,130
at the end that are missing,
those are the special tokens.

51
00:02:17,130 --> 00:02:20,340
The special tokens are added
by the prepare_for_model method

52
00:02:20,340 --> 00:02:22,350
which knows the indices of those tokens

53
00:02:22,350 --> 00:02:25,680
in the vocabulary and just
adds the proper numbers

54
00:02:25,680 --> 00:02:27,243
in the input IDs list.

55
00:02:28,590 --> 00:02:29,541
You can look at the special tokens

56
00:02:29,541 --> 00:02:30,990
and, more generally,

57
00:02:30,990 --> 00:02:33,870
at how the tokenizer
has changed your text,

58
00:02:33,870 --> 00:02:35,280
by using the decode method

59
00:02:35,280 --> 00:02:37,503
on the outputs of the tokenizer object.

60
00:02:38,490 --> 00:02:39,423
As for the prefix for beginning

61
00:02:39,423 --> 00:02:44,160
of words/ part of words, the
special tokens vary depending

62
00:02:44,160 --> 00:02:46,500
on which tokenizer you're using.

63
00:02:46,500 --> 00:02:48,810
So the BERT tokenizer uses CLS and SEP,

64
00:02:48,810 --> 00:02:52,417
but the roberta tokenizer
uses HTML-like anchors

65
00:02:52,417 --> 00:02:55,230
<s> and </s>.

66
00:02:55,230 --> 00:02:57,090
Now that you know how the tokenizer works,

67
00:02:57,090 --> 00:02:59,390
you can forget all those
intermediate methods,

68
00:03:00,283 --> 00:03:01,650
and only remember that
you just have to call it

69
00:03:01,650 --> 00:03:02,913
on your input texts.

70
00:03:03,870 --> 00:03:05,310
The output of a tokenizer doesn't

71
00:03:05,310 --> 00:03:07,853
just contain the input IDs, however.

72
00:03:07,853 --> 00:03:09,750
To learn what the attention mask is,

73
00:03:09,750 --> 00:03:12,360
check out the "Batch
inputs together" video.

74
00:03:12,360 --> 00:03:14,220
To learn about token type IDs,

75
00:03:14,220 --> 00:03:16,570
look at the "Process
pairs of sentences" video.

76
00:03:18,003 --> 00:03:20,920
(object whooshing)

diff --git a/subtitles/en/17_batching-inputs-together-(pytorch).srt b/subtitles/en/17_batching-inputs-together-(pytorch).srt
index dd1115253..ca7f696c8 100644
--- a/subtitles/en/17_batching-inputs-together-(pytorch).srt
+++ b/subtitles/en/17_batching-inputs-together-(pytorch).srt
@@ -1,144 +1,291 @@
-1
-00:00:05,200 --> 00:00:10,880
-How to batch inputs together? In this video, we  
-will see how to batch input sequences together.   

-2
-00:00:12,320 --> 00:00:16,560
-In general, the sentences we want to pass through  
-our model won't all have the same lengths.   

-3
-00:00:17,520 --> 00:00:21,280
-Here we are using the model we saw  
-in the sentiment analysis pipeline   

-4
-00:00:21,840 --> 00:00:26,800
-and want to classify two sentences.  
-When tokenizing them and mapping each   

-5
-00:00:26,800 --> 00:00:31,280
-token to its corresponding input IDs,  
-we get two lists of different lengths.   
- -6 -00:00:33,040 --> 00:00:38,400 -Trying to create a tensor or a NumPy array from  -those two lists will result in an error, because   - -7 -00:00:38,400 --> 00:00:44,560 -all arrays and tensors should be rectangular.  -One way to overcome this limit is to make the   - -8 -00:00:44,560 --> 00:00:50,160 -second sentence the same length as the first by  -adding a special token as many times as necessary.   - -9 -00:00:51,360 --> 00:00:55,760 -Another way would be to truncate the first  -sequence to the length of the second, but we   - -10 -00:00:55,760 --> 00:01:00,720 -would them lose a lot of information that might  -be necessary to properly classify the sentence.   - -11 -00:01:02,000 --> 00:01:06,720 -In general, we only truncate sentences when  -they are longer than the maximum length the   - -12 -00:01:06,720 --> 00:01:14,000 -model can handle. The value used to pad the second  -sentence should not be picked randomly: the model   - -13 -00:01:14,000 --> 00:01:19,200 -has been pretrained with a certain padding ID,  -which you can find in tokenizer.pad_token_id.   - -14 -00:01:20,800 --> 00:01:25,200 -Now that we have padded our sentences,  -we can make a batch with them. If   - -15 -00:01:25,200 --> 00:01:29,840 -we pass the two sentences to the model  -separately and batched together however,   - -16 -00:01:29,840 --> 00:01:35,120 -we notice that we don't get the same results for  -the sentence that is padded (here the second one).   - -17 -00:01:39,120 --> 00:01:42,880 -If you remember that Transformer models make  -heavy use of attention layers, this should   - -18 -00:01:42,880 --> 00:01:47,760 -not come as a total surprise: when computing  -the contextual representation of each token,   - -19 -00:01:48,560 --> 00:01:54,320 -the attention layers look at all the other words  -in the sentence. If we have just the sentence or   - -20 -00:01:54,320 --> 00:01:58,720 -the sentence with several padding tokens added,  -it's logical we don't get the same values.   - -21 -00:02:00,000 --> 00:02:05,120 -To get the same results with or without padding,  -we need to indicate to the attention layers   - -22 -00:02:05,120 --> 00:02:10,320 -that they should ignore those padding tokens.  -This is done by creating an attention mask,   - -23 -00:02:10,320 --> 00:02:16,560 -a tensor with the same shape as the input  -IDs, with zeros and ones. Ones indicate the   - -24 -00:02:16,560 --> 00:02:21,840 -tokens the attention layers should consider in the  -context and zeros the tokens they should ignore.   - -25 -00:02:23,360 --> 00:02:26,560 -Now passing this attention  -mask along with the input ids   - -26 -00:02:26,560 --> 00:02:30,720 -will give us the same results as when we sent  -the two sentences individually to the model!   - -27 -00:02:32,160 --> 00:02:36,640 -This is all done behind the scenes by the  -tokenizer when you apply it to several sentences   - -28 -00:02:36,640 --> 00:02:41,280 -with the flag padding=True. It will  -apply the padding with the proper value   - -29 -00:02:41,280 --> 00:02:49,840 -to the smaller sentences and create  -the appropriate attention mask. +1 +00:00:00,373 --> 00:00:02,956 +(subtle blast) + +2 +00:00:05,400 --> 00:00:07,590 +- How to batch inputs together. + +3 +00:00:07,590 --> 00:00:09,240 +In this video, we will see how + +4 +00:00:09,240 --> 00:00:11,073 +to batch input sequences together. + +5 +00:00:12,137 --> 00:00:15,420 +In general, the sentences we +want to pass through our model + +6 +00:00:15,420 --> 00:00:17,670 +won't all have the same lengths. 
+

7
00:00:17,670 --> 00:00:19,740
Here, we are using the model we saw

8
00:00:19,740 --> 00:00:22,080
in the sentiment analysis pipeline

9
00:00:22,080 --> 00:00:24,063
and want to classify two sentences.

10
00:00:24,900 --> 00:00:27,360
When tokenizing them
and mapping each token

11
00:00:27,360 --> 00:00:29,610
to its corresponding input IDs,

12
00:00:29,610 --> 00:00:31,593
we get two lists of different lengths.

13
00:00:33,240 --> 00:00:35,340
Trying to create a tensor or a NumPy array

14
00:00:35,340 --> 00:00:38,220
from those two lists
will result in an error,

15
00:00:38,220 --> 00:00:41,043
because all arrays and
tensors should be rectangular.

16
00:00:42,240 --> 00:00:44,160
One way to overcome this limit

17
00:00:44,160 --> 00:00:45,690
is to make the second sentence

18
00:00:45,690 --> 00:00:47,640
the same length as the first

19
00:00:47,640 --> 00:00:50,463
by adding a special token
as many times as necessary.

20
00:00:51,600 --> 00:00:53,970
Another way would be to
truncate the first sequence

21
00:00:53,970 --> 00:00:55,710
to the length of the second,

22
00:00:55,710 --> 00:00:58,140
but we would then lose
a lot of information

23
00:00:58,140 --> 00:01:01,083
that might be necessary to
properly classify the sentence.

24
00:01:02,190 --> 00:01:04,830
In general, we only truncate sentences

25
00:01:04,830 --> 00:01:06,840
when they are longer
than the maximum length

26
00:01:06,840 --> 00:01:08,073
the model can handle.

27
00:01:09,720 --> 00:01:11,850
The value used to pad the second sentence

28
00:01:11,850 --> 00:01:13,740
should not be picked randomly;

29
00:01:13,740 --> 00:01:16,680
the model has been pretrained
with a certain padding ID,

30
00:01:16,680 --> 00:01:19,533
which you can find in
tokenizer.pad_token_id.

31
00:01:21,090 --> 00:01:22,800
Now that we have padded our sentences,

32
00:01:22,800 --> 00:01:24,303
we can make a batch with them.

33
00:01:25,380 --> 00:01:28,320
If we pass the two sentences
to the model separately

34
00:01:28,320 --> 00:01:30,120
and batched together however,

35
00:01:30,120 --> 00:01:32,100
we notice that we don't
get the same results

36
00:01:32,100 --> 00:01:34,060
for the sentence that is padded,

37
00:01:34,060 --> 00:01:35,403
here, the second one.

38
00:01:36,390 --> 00:01:39,420
Is this a bug in the
Transformers library? No.

39
00:01:39,420 --> 00:01:40,770
If you remember that Transformer models

40
00:01:40,770 --> 00:01:42,810
make heavy use of attention layers,

41
00:01:42,810 --> 00:01:45,210
this should not come as a total surprise;

42
00:01:45,210 --> 00:01:48,277
when computing the contextual
representation of each token,

43
00:01:48,277 --> 00:01:50,910
the attention layers look
at all the other words

44
00:01:50,910 --> 00:01:52,410
in the sentence.

45
00:01:52,410 --> 00:01:53,850
If we have just the sentence

46
00:01:53,850 --> 00:01:56,970
or the sentence with several
padding tokens added,

47
00:01:56,970 --> 00:01:59,073
it's logical we don't get the same values.

48
00:02:00,270 --> 00:02:03,030
To get the same results
with or without padding,

49
00:02:03,030 --> 00:02:05,340
we need to indicate to
the attention layers

50
00:02:05,340 --> 00:02:08,070
that they should ignore
those padding tokens.
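A sketch of the padding and attention-mask logic described here, in PyTorch; the checkpoint is the one behind the default sentiment-analysis pipeline, the two sentences are only examples, and the first one is assumed to be the longer of the two.

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

ids1 = tokenizer("I've been waiting for a HuggingFace course my whole life.")["input_ids"]
ids2 = tokenizer("I hate this so much!")["input_ids"]

# Pad the shorter sequence with the tokenizer's padding ID...
padding_length = len(ids1) - len(ids2)
ids2_padded = ids2 + [tokenizer.pad_token_id] * padding_length

# ...and mark the padding positions with 0 in the attention mask.
attention_mask = [
    [1] * len(ids1),
    [1] * len(ids2) + [0] * padding_length,
]

outputs = model(
    input_ids=torch.tensor([ids1, ids2_padded]),
    attention_mask=torch.tensor(attention_mask),
)
print(outputs.logits)  # matches the logits of each sentence passed individually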
+ +51 +00:02:08,070 --> 00:02:10,620 +This is done by creating +an attention mask, + +52 +00:02:10,620 --> 00:02:13,320 +a tensor with the same +shape as the input IDs, + +53 +00:02:13,320 --> 00:02:14,733 +with zeros and ones. + +54 +00:02:15,780 --> 00:02:18,120 +Ones indicate the tokens +the attention layers + +55 +00:02:18,120 --> 00:02:20,100 +should consider in the context + +56 +00:02:20,100 --> 00:02:22,100 +and zeros the tokens they should ignore. + +57 +00:02:23,520 --> 00:02:26,760 +Now, passing this attention +mask along with the input ID + +58 +00:02:26,760 --> 00:02:28,170 +will give us the same results + +59 +00:02:28,170 --> 00:02:31,170 +as when we sent the two sentences +individually to the model. + +60 +00:02:32,400 --> 00:02:34,950 +This is all done behind +the scenes by the tokenizer + +61 +00:02:34,950 --> 00:02:36,900 +when you apply it to several sentences + +62 +00:02:36,900 --> 00:02:38,613 +with the flag padding=True. + +63 +00:02:39,599 --> 00:02:41,490 +It will apply the padding +with the proper value + +64 +00:02:41,490 --> 00:02:43,140 +to the smaller sentences + +65 +00:02:43,140 --> 00:02:45,423 +and create the appropriate attention mask. + +66 +00:02:46,993 --> 00:02:49,576 +(subtle blast) + diff --git a/subtitles/en/18_batching-inputs-together-(tensorflow).srt b/subtitles/en/18_batching-inputs-together-(tensorflow).srt index 8c78a8f0c..c31449704 100644 --- a/subtitles/en/18_batching-inputs-together-(tensorflow).srt +++ b/subtitles/en/18_batching-inputs-together-(tensorflow).srt @@ -1,138 +1,281 @@ -1 -00:00:05,120 --> 00:00:10,880 -How to batch inputs together? In this video, we  -will see how to batch input sequences together.   - -2 -00:00:12,480 --> 00:00:16,560 -In general, the sentences we want to pass  -through our model won't all have the same   - -3 -00:00:16,560 --> 00:00:23,520 -lengths. Here we are using the model we saw in the  -sentiment analysis pipeline and want to classify   - -4 -00:00:23,520 --> 00:00:29,760 -two sentences. When tokenizing them and mapping  -each token to its corresponding input IDs,   - -5 -00:00:29,760 --> 00:00:31,680 -we get two lists of different lengths.   - -6 -00:00:33,120 --> 00:00:38,240 -Trying to create a tensor or a NumPy array from  -those two lists will result in an error, because   - -7 -00:00:38,240 --> 00:00:44,320 -all arrays and tensors should be rectangular.  -One way to overcome this limit is to make the   - -8 -00:00:44,320 --> 00:00:50,080 -second sentence the same length as the first by  -adding a special token as many times as necessary.   - -9 -00:00:51,040 --> 00:00:55,360 -Another way would be to truncate the first  -sequence to the length of the second, but we   - -10 -00:00:55,360 --> 00:01:00,080 -would them lose a lot of information that might  -be necessary to properly classify the sentence.   - -11 -00:01:01,040 --> 00:01:05,760 -In general, we only truncate sentences when  -they are longer than the maximum length the   - -12 -00:01:05,760 --> 00:01:12,560 -model can handle. The value used to pad the second  -sentence should not be picked randomly: the model   - -13 -00:01:12,560 --> 00:01:18,000 -has been pretrained with a certain padding ID,  -which you can find in tokenizer.pad_token_id.   - -14 -00:01:19,760 --> 00:01:22,640 -Now that we have padded our sentences,  -we can make a batch with them.   
- -15 -00:01:23,920 --> 00:01:28,400 -If we pass the two sentences to the model  -separately and batched together however,   - -16 -00:01:28,400 --> 00:01:33,600 -we notice that we don't get the same results for  -the sentence that is padded (here the second one).   - -17 -00:01:37,360 --> 00:01:41,440 -If you remember that Transformer models make  -heavy use of attention layers, this should   - -18 -00:01:41,440 --> 00:01:46,800 -not come as a total surprise: when computing  -the contextual representation of each token,   - -19 -00:01:46,800 --> 00:01:52,800 -the attention layers look at all the other words  -in the sentence. If we have just the sentence or   - -20 -00:01:52,800 --> 00:01:57,200 -the sentence with several padding tokens added,  -it's logical we don't get the same values.   - -21 -00:01:58,560 --> 00:02:03,520 -To get the same results with or without padding,  -we need to indicate to the attention layers   - -22 -00:02:03,520 --> 00:02:08,640 -that they should ignore those padding tokens.  -This is done by creating an attention mask,   - -23 -00:02:08,640 --> 00:02:15,920 -a tensor with the same shape as the input IDs,  -with zeros and ones. Ones indicate the tokens the   - -24 -00:02:15,920 --> 00:02:22,160 -attention layers should consider in the context  -and zeros the tokens they should ignore. Now   - -25 -00:02:22,160 --> 00:02:27,040 -passing this attention mask along with the input  -ids will give us the same results as when we sent   - -26 -00:02:27,040 --> 00:02:33,600 -the two sentences individually to the model! This  -is all done behind the scenes by the tokenizer   - -27 -00:02:33,600 --> 00:02:39,680 -when you apply it to several sentences with the  -flag padding=True. It will apply the padding with   - -28 -00:02:39,680 --> 00:02:49,840 -the proper value to the smaller sentences  -and create the appropriate attention mask. +1 +00:00:00,458 --> 00:00:02,791 +(logo whooshes) + +2 +00:00:05,310 --> 00:00:07,590 +- How to batch inputs together. + +3 +00:00:07,590 --> 00:00:09,150 +In this video, we'll see + +4 +00:00:09,150 --> 00:00:11,050 +how to batch input sequences together. + +5 +00:00:12,630 --> 00:00:14,910 +In general, the sentences we want to pass + +6 +00:00:14,910 --> 00:00:18,000 +through our model won't +all have the same lengths. + +7 +00:00:18,000 --> 00:00:20,310 +Here, we are using the model we saw + +8 +00:00:20,310 --> 00:00:22,650 +in the sentiment analysis pipeline + +9 +00:00:22,650 --> 00:00:24,753 +and want to classify two sentences. + +10 +00:00:25,860 --> 00:00:27,870 +When tokenizing them +and mapping each token + +11 +00:00:27,870 --> 00:00:30,000 +to its corresponding input IDs, + +12 +00:00:30,000 --> 00:00:31,900 +we get two lists of different lengths. + +13 +00:00:33,360 --> 00:00:35,070 +Trying to create a tensor and NumPy array + +14 +00:00:35,070 --> 00:00:38,100 +from those two lists +will result in an error + +15 +00:00:38,100 --> 00:00:40,953 +because all arrays and +tensors should be rectangular. + +16 +00:00:42,510 --> 00:00:43,920 +One way to overcome this limit + +17 +00:00:43,920 --> 00:00:47,340 +is to make the second sentence +the same length as the first + +18 +00:00:47,340 --> 00:00:50,373 +by adding a special token +as many times as necessary. 
+

19
00:00:51,300 --> 00:00:53,340
Another way would be to
truncate the first sequence

20
00:00:53,340 --> 00:00:56,550
to the length of the second,
but we would then lose a lot

21
00:00:56,550 --> 00:00:58,590
of information that may be necessary

22
00:00:58,590 --> 00:01:01,230
to properly classify the sentence.

23
00:01:01,230 --> 00:01:04,710
In general, we only truncate
sentences when they are longer

24
00:01:04,710 --> 00:01:07,083
than the maximum length
the model can handle.

25
00:01:08,310 --> 00:01:10,320
The value used to pad the second sentence

26
00:01:10,320 --> 00:01:12,390
should not be picked randomly.

27
00:01:12,390 --> 00:01:15,330
The model has been pretrained
with a certain padding ID,

28
00:01:15,330 --> 00:01:18,093
which you can find in
tokenizer.pad_token_id.

29
00:01:19,950 --> 00:01:21,630
Now that we have padded our sentences,

30
00:01:21,630 --> 00:01:23,130
we can make a batch with them.

31
00:01:24,210 --> 00:01:26,730
If we pass the two sentences
to the model separately

32
00:01:26,730 --> 00:01:29,130
or batched together, however, we notice

33
00:01:29,130 --> 00:01:30,630
that we don't get the same results

34
00:01:30,630 --> 00:01:32,070
for the sentence that is padded.

35
00:01:32,070 --> 00:01:34,440
Here, the second one.

36
00:01:34,440 --> 00:01:36,690
Is this a bug in the
Transformers library?

37
00:01:36,690 --> 00:01:37,620
No.

38
00:01:37,620 --> 00:01:39,720
If you remember that Transformer
models make heavy use

39
00:01:39,720 --> 00:01:43,800
of attention layers, it should
not come as a total surprise.

40
00:01:43,800 --> 00:01:47,100
When computing the contextual
representation of each token,

41
00:01:47,100 --> 00:01:49,440
the attention layers look
at all the other words

42
00:01:49,440 --> 00:01:51,240
in the sentence.

43
00:01:51,240 --> 00:01:52,252
If we have just a sentence

44
00:01:52,252 --> 00:01:55,650
or the sentence with several
padding tokens added,

45
00:01:55,650 --> 00:01:57,750
it's logical we don't get the same values.

46
00:01:58,830 --> 00:02:01,410
To get the same results
with or without padding,

47
00:02:01,410 --> 00:02:03,750
we need to indicate to
the attention layers

48
00:02:03,750 --> 00:02:06,660
that they should ignore
those padding tokens.

49
00:02:06,660 --> 00:02:08,970
This is done by creating
an attention mask,

50
00:02:08,970 --> 00:02:11,700
a tensor with the same
shape as the input IDs

51
00:02:11,700 --> 00:02:13,173
with zeros and ones.

52
00:02:14,640 --> 00:02:16,830
Ones indicate the tokens
the attention layers

53
00:02:16,830 --> 00:02:18,660
should consider in the context,

54
00:02:18,660 --> 00:02:20,823
and zeros, the tokens they should ignore.

55
00:02:21,810 --> 00:02:23,290
Now, passing this attention mask

56
00:02:23,290 --> 00:02:26,460
along with the input IDs
will give us the same results

57
00:02:26,460 --> 00:02:29,460
as when we sent the two sentences
individually to the model.

58
00:02:30,870 --> 00:02:33,870
This is all done behind
the scenes by the tokenizer

59
00:02:33,870 --> 00:02:35,583
when you apply it to several sentences

60
00:02:35,583 --> 00:02:37,713
with the flag padding equals true.
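In practice you rarely build the mask yourself: as said here, padding=True does it for you. A sketch with TensorFlow tensors, where the checkpoint and sentences are illustrative and TensorFlow is assumed to be installed.

from transformers import AutoTokenizer

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]
batch = tokenizer(sentences, padding=True, return_tensors="tf")

print(batch["input_ids"].shape)   # a rectangular batch, padded to the longest sentence
print(batch["attention_mask"])    # zeros over the padding tokens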
+ +61 +00:02:38,640 --> 00:02:39,690 +It will apply the padding + +62 +00:02:39,690 --> 00:02:42,180 +with the proper value +to the smaller sentences + +63 +00:02:42,180 --> 00:02:44,373 +and create the appropriate attention mask. + diff --git a/subtitles/en/19_hugging-face-datasets-overview-(pytorch).srt b/subtitles/en/19_hugging-face-datasets-overview-(pytorch).srt index 21a4ee3c3..f05f6ceab 100644 --- a/subtitles/en/19_hugging-face-datasets-overview-(pytorch).srt +++ b/subtitles/en/19_hugging-face-datasets-overview-(pytorch).srt @@ -1,164 +1,341 @@ -1 -00:00:05,120 --> 00:00:11,520 -The Hugging Face Datasets library: A Quick  -overview. The Hugging Face Datasets library   - -2 -00:00:11,520 --> 00:00:16,560 -is a library that provides an API to quickly  -download many public datasets and preprocess them.   - -3 -00:00:17,360 --> 00:00:22,560 -In this video we will explore how to do that. The  -downloading part is easy: with the load_dataset   - -4 -00:00:22,560 --> 00:00:28,400 -function, you can directly download and cache a  -dataset from its identifier on the Dataset hub.   - -5 -00:00:29,520 --> 00:00:32,720 -Here we fetch the MRPC dataset  -from the GLUE benchmark,   - -6 -00:00:33,360 --> 00:00:38,320 -which is a dataset containing pairs of sentences  -where the task is to determine the paraphrases.   - -7 -00:00:39,520 --> 00:00:45,440 -The object returned by the load_dataset function  -is a DatasetDict, which is a sort of dictionary   - -8 -00:00:45,440 --> 00:00:51,120 -containing each split of our dataset. We can  -access each split by indexing with its name.   - -9 -00:00:52,000 --> 00:00:57,440 -This split is then an instance of the  -Dataset class, with columns (here sentence1,   - -10 -00:00:57,440 --> 00:01:04,240 -sentence2. label and idx) and rows. We  -can access a given element by its index.   - -11 -00:01:05,200 --> 00:01:10,000 -The amazing thing about the Hugging Face Datasets  -library is that everything is saved to disk   - -12 -00:01:10,000 --> 00:01:15,520 -using Apache Arrow, which means that even if  -your dataset is huge you won't get out of RAM:   - -13 -00:01:16,080 --> 00:01:21,920 -only the elements you request are loaded in  -memory. Accessing a slice of your dataset is   - -14 -00:01:21,920 --> 00:01:26,720 -as easy as one element. The result is then a  -dictionary with list of values for each keys   - -15 -00:01:27,280 --> 00:01:31,600 -(here the list of labels, the list of first  -sentences and the list of second sentences).   - -16 -00:01:33,440 --> 00:01:38,880 -The features attribute of a Dataset gives us more  -information about its columns. In particular,   - -17 -00:01:38,880 --> 00:01:45,280 -we can see here it gives us the correspondence  -between the integers and names for the labels. 0   - -18 -00:01:45,280 --> 00:01:51,760 -stands for not equivalent and 1 for equivalent.  -To preprocess all the elements of our dataset,   - -19 -00:01:51,760 --> 00:01:56,800 -we need to tokenize them. Have a look at the  -video "Preprocess sentence pairs" for a refresher,   - -20 -00:01:57,360 --> 00:02:02,320 -but you just have to send the two sentences to the  -tokenizer with some additional keyword arguments.   - -21 -00:02:03,520 --> 00:02:08,560 -Here we indicate a maximum length of 128  -and pad inputs shorter than this length,   - -22 -00:02:08,560 --> 00:02:14,320 -truncate inputs that are longer. 
We put all of  -this in a tokenize_function that we can directly   - -23 -00:02:14,320 --> 00:02:20,240 -apply to all the splits in our dataset with the  -map method. As long as the function returns a   - -24 -00:02:20,240 --> 00:02:25,680 -dictionary-like object, the map method will add  -new columns as needed or update existing ones.   - -25 -00:02:27,360 --> 00:02:31,840 -To speed up preprocessing and take advantage  -of the fact our tokenizer is backed by Rust   - -26 -00:02:31,840 --> 00:02:36,880 -thanks to the Hugging Face Tokenizers library, we  -can process several elements at the same time to   - -27 -00:02:36,880 --> 00:02:42,160 -our tokenize function, using the batched=True  -argument. Since the tokenizer can handle list   - -28 -00:02:42,160 --> 00:02:48,880 -of first/second sentences, the tokenize_function  -does not need to change for this. You can also use   - -29 -00:02:49,440 --> 00:02:56,400 -multiprocessing with the map method, check out its  -documentation! Once this is done, we are almost   - -30 -00:02:56,400 --> 00:03:01,920 -ready for training: we just remove the columns we  -don't need anymore with the remove_columns method,   - -31 -00:03:01,920 --> 00:03:06,640 -rename label to labels (since the models  -from Hugging Face Transformers expect that)   - -32 -00:03:07,440 --> 00:03:14,000 -and set the output format to our desired  -backend: torch, tensorflow or numpy. If needed,   - -33 -00:03:14,000 --> 00:03:17,840 -we can also generate a short sample  -of a dataset using the select method. +1 +00:00:00,213 --> 00:00:02,963 +(slide whooshes) + +2 +00:00:05,340 --> 00:00:08,373 +- The Hugging Face Datasets +library, a quick overview. + +3 +00:00:09,990 --> 00:00:11,670 +The Hugging Face Datasets library + +4 +00:00:11,670 --> 00:00:14,310 +is a library that provides +an API to quickly download + +5 +00:00:14,310 --> 00:00:17,610 +many public datasets and preprocess them. + +6 +00:00:17,610 --> 00:00:20,614 +In this video we will +explore how to do that. + +7 +00:00:20,614 --> 00:00:21,780 +The downloading part is easy, + +8 +00:00:21,780 --> 00:00:23,760 +with the load_dataset function. + +9 +00:00:23,760 --> 00:00:26,460 +You can directly download +and cache a dataset + +10 +00:00:26,460 --> 00:00:28,473 +from its identifier on the Dataset hub. + +11 +00:00:29,640 --> 00:00:33,570 +Here, we fetch the MRPC dataset +from the GLUE benchmark, + +12 +00:00:33,570 --> 00:00:36,390 +which is a dataset +containing pairs of sentences + +13 +00:00:36,390 --> 00:00:38,740 +where the task is to +determine the paraphrases. + +14 +00:00:39,810 --> 00:00:42,420 +The object returned by +the load_dataset function + +15 +00:00:42,420 --> 00:00:45,600 +is a DatasetDict, which +is a sort of dictionary + +16 +00:00:45,600 --> 00:00:47,463 +containing each split of our dataset. + +17 +00:00:48,946 --> 00:00:52,170 +We can access each split +by indexing with its name. + +18 +00:00:52,170 --> 00:00:55,047 +This split is then an +instance of the Dataset class, + +19 +00:00:55,047 --> 00:00:58,590 +with columns, here sentence1, sentence2, + +20 +00:00:58,590 --> 00:01:01,233 +label and idx, and rows. + +21 +00:01:02,400 --> 00:01:04,563 +We can access a given +element by its index. 
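A sketch of the loading and indexing steps just described; it assumes the datasets library is installed and can download the GLUE MRPC data.

from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")
print(raw_datasets)                  # a DatasetDict with train, validation and test splits

raw_train = raw_datasets["train"]
print(raw_train[0])                  # one element: sentence1, sentence2, label, idx
print(raw_train[:3]["sentence1"])    # a slice gives a dictionary of lists
print(raw_train.features["label"])   # the label names: not_equivalent / equivalent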
+ +22 +00:01:05,460 --> 00:01:08,220 +The amazing thing about the +Hugging Face Datasets library + +23 +00:01:08,220 --> 00:01:11,880 +is that everything is saved +to disk using Apache Arrow, + +24 +00:01:11,880 --> 00:01:14,550 +which means that even +if your dataset is huge, + +25 +00:01:14,550 --> 00:01:16,350 +you won't get out of RAM. + +26 +00:01:16,350 --> 00:01:19,113 +Only the elements you +request are loaded in memory. + +27 +00:01:20,340 --> 00:01:23,940 +Accessing a slice of your dataset +is as easy as one element. + +28 +00:01:23,940 --> 00:01:26,220 +The result is then a +dictionary with list of values + +29 +00:01:26,220 --> 00:01:27,480 +for each keys. + +30 +00:01:27,480 --> 00:01:29,070 +Here the list of labels, + +31 +00:01:29,070 --> 00:01:30,147 +the list of first sentences + +32 +00:01:30,147 --> 00:01:31,923 +and the list of second sentences. + +33 +00:01:33,690 --> 00:01:35,580 +The features attribute of a Dataset + +34 +00:01:35,580 --> 00:01:37,470 +gives us more information +about its columns. + +35 +00:01:37,470 --> 00:01:40,020 +In particular, we can see here + +36 +00:01:40,020 --> 00:01:41,400 +it gives us the correspondence + +37 +00:01:41,400 --> 00:01:44,810 +between the integers and +names for the labels. + +38 +00:01:44,810 --> 00:01:48,543 +Zero stands for not equivalent +and one for equivalent. + +39 +00:01:49,830 --> 00:01:52,020 +To preprocess all the +elements of our dataset, + +40 +00:01:52,020 --> 00:01:53,850 +we need to tokenize them. + +41 +00:01:53,850 --> 00:01:56,160 +Have a look at the video +"Preprocess sentence pairs" + +42 +00:01:56,160 --> 00:01:57,570 +for a refresher, + +43 +00:01:57,570 --> 00:01:59,430 +but you just have to +send the two sentences + +44 +00:01:59,430 --> 00:02:02,733 +to the tokenizer with some +additional keyword arguments. + +45 +00:02:03,780 --> 00:02:06,600 +Here we indicate a maximum length of 128 + +46 +00:02:06,600 --> 00:02:08,820 +and pad inputs shorter than this length, + +47 +00:02:08,820 --> 00:02:10,420 +truncate inputs that are longer. + +48 +00:02:11,460 --> 00:02:13,470 +We put all of this in a tokenize_function + +49 +00:02:13,470 --> 00:02:16,710 +that we can directly apply to +all the splits in our dataset + +50 +00:02:16,710 --> 00:02:17,710 +with the map method. + +51 +00:02:18,840 --> 00:02:22,110 +As long as the function returns +a dictionary-like object, + +52 +00:02:22,110 --> 00:02:24,300 +the map method will add +new columns as needed + +53 +00:02:24,300 --> 00:02:26,043 +or update existing ones. + +54 +00:02:27,315 --> 00:02:28,830 +To speed up preprocessing + +55 +00:02:28,830 --> 00:02:30,870 +and take advantage of +the fact our tokenizer + +56 +00:02:30,870 --> 00:02:32,040 +is backed by Rust, + +57 +00:02:32,040 --> 00:02:34,770 +thanks to the Hugging +Face Tokenizers library, + +58 +00:02:34,770 --> 00:02:37,110 +we can process several +elements at the same time + +59 +00:02:37,110 --> 00:02:40,710 +to our tokenize function, using +the batched=True argument. + +60 +00:02:40,710 --> 00:02:42,120 +Since the tokenizer can handle + +61 +00:02:42,120 --> 00:02:44,610 +list of first sentences, +list of second sentences, + +62 +00:02:44,610 --> 00:02:47,493 +the tokenize_function does +not need to change for this. + +63 +00:02:48,360 --> 00:02:51,180 +You can also use multiprocessing +with the map method. + +64 +00:02:51,180 --> 00:02:53,583 +Check out its documentation +in the linked video. + +65 +00:02:54,840 --> 00:02:57,990 +Once this is done, we are +almost ready for training. 
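A sketch putting the preprocessing described here, and the clean-up described just after, into one script; the checkpoint and column names follow the MRPC example, and tokenize_function is our own name for the helper.

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(examples):
    # Tokenize the sentence pairs, padding/truncating them to 128 tokens.
    return tokenizer(
        examples["sentence1"],
        examples["sentence2"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

# batched=True lets the fast tokenizer process many elements at once.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

# Keep only what the model needs, rename the label column and pick a backend.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

small_train = tokenized_datasets["train"].select(range(100))  # a short sample if needed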
+ +66 +00:02:57,990 --> 00:02:59,970 +We just remove the columns +we don't need anymore + +67 +00:02:59,970 --> 00:03:02,190 +with the remove_columns method, + +68 +00:03:02,190 --> 00:03:03,750 +rename label to labels, + +69 +00:03:03,750 --> 00:03:05,790 +since the models from the +Hugging Face Transformers + +70 +00:03:05,790 --> 00:03:07,710 +library expect that, + +71 +00:03:07,710 --> 00:03:10,470 +and set the output format +to our desired backend, + +72 +00:03:10,470 --> 00:03:12,053 +Torch, TensorFlow or NumPy. + +73 +00:03:13,440 --> 00:03:16,800 +If needed, we can also generate +a short sample of a dataset + +74 +00:03:16,800 --> 00:03:18,000 +using the select method. + +75 +00:03:20,211 --> 00:03:22,961 +(slide whooshes) + diff --git a/subtitles/en/20_hugging-face-datasets-overview-(tensorflow).srt b/subtitles/en/20_hugging-face-datasets-overview-(tensorflow).srt index 56d938c9d..5daa26c88 100644 --- a/subtitles/en/20_hugging-face-datasets-overview-(tensorflow).srt +++ b/subtitles/en/20_hugging-face-datasets-overview-(tensorflow).srt @@ -1,164 +1,320 @@ -1 -00:00:05,200 --> 00:00:11,200 -The Hugging Face Datasets library: A Quick  -overview. The Hugging Face Datasets library   - -2 -00:00:11,200 --> 00:00:15,920 -is a library that provides an API to quickly  -download many public datasets and preprocess them.   - -3 -00:00:16,880 --> 00:00:22,480 -In this video we will explore how to do that. The  -downloading part is easy: with the load_dataset   - -4 -00:00:22,480 --> 00:00:27,760 -function, you can directly download and cache a  -dataset from its identifier on the Dataset hub.   - -5 -00:00:29,040 --> 00:00:34,160 -Here we fetch the MRPC dataset from  -the GLUE benchmark, which is a dataset   - -6 -00:00:34,160 --> 00:00:38,000 -containing pairs of sentences where the  -task is to determine the paraphrases.   - -7 -00:00:39,360 --> 00:00:44,880 -The object returned by the load_dataset function  -is a DatasetDict, which is a sort of dictionary   - -8 -00:00:44,880 --> 00:00:50,800 -containing each split of our dataset. We can  -access each split by indexing with its name.   - -9 -00:00:51,520 --> 00:00:55,120 -This split is then an instance of  -the Dataset class, with columns   - -10 -00:00:55,680 --> 00:01:04,000 -(here sentence1, sentence2. label and idx) and  -rows. We can access a given element by its index.   - -11 -00:01:04,880 --> 00:01:10,240 -The amazing thing about the Hugging Face Datasets  -library is that everything is saved to disk using   - -12 -00:01:10,240 --> 00:01:15,280 -Apache Arrow, which means that even if your  -dataset is huge you won't get out of RAM:   - -13 -00:01:15,920 --> 00:01:21,760 -only the elements you request are loaded in  -memory. Accessing a slice of your dataset is   - -14 -00:01:21,760 --> 00:01:27,760 -as easy as one element. The result is then a  -dictionary with list of values for each keys   - -15 -00:01:28,480 --> 00:01:35,040 -(here the list of labels, the list of first  -sentences and the list of second sentences). The   - -16 -00:01:35,040 --> 00:01:40,720 -features attribute of a Dataset gives us more  -information about its columns. In particular,   - -17 -00:01:40,720 --> 00:01:45,040 -we can see here it gives us the correspondence  -between the integers and names for the labels.   - -18 -00:01:45,920 --> 00:01:53,840 -0 stands for not equivalent and 1 for equivalent.  -To preprocess all the elements of our dataset,   - -19 -00:01:53,840 --> 00:01:59,120 -we need to tokenize them. 
Have a look at the  -video "Preprocess sentence pairs" for a refresher,   - -20 -00:01:59,840 --> 00:02:04,480 -but you just have to send the two sentences to the  -tokenizer with some additional keyword arguments.   - -21 -00:02:05,760 --> 00:02:11,200 -Here we indicate a maximum length of 128  -and pad inputs shorter than this length,   - -22 -00:02:11,200 --> 00:02:17,040 -truncate inputs that are longer. We put all of  -this in a tokenize_function that we can directly   - -23 -00:02:17,040 --> 00:02:22,320 -apply to all the splits in our dataset with the  -map method. As long as the function returns a   - -24 -00:02:22,320 --> 00:02:27,760 -dictionary-like object, the map method will add  -new columns as needed or update existing ones.   - -25 -00:02:29,840 --> 00:02:34,960 -To speed up preprocessing and take advantage  -of the fact our tokenizer is backed by Rust   - -26 -00:02:34,960 --> 00:02:40,320 -thanks to the Hugging Face Tokenizers library,  -we can process several elements at the same time   - -27 -00:02:40,320 --> 00:02:46,800 -to our tokenize function, using the batched=True  -argument. Since the tokenizer can handle list   - -28 -00:02:46,800 --> 00:02:53,360 -of first/second sentences, the tokenize_function  -does not need to change for this. You can also use   - -29 -00:02:53,360 --> 00:03:00,320 -multiprocessing with the map method, check out its  -documentation! Once this is done, we are almost   - -30 -00:03:00,320 --> 00:03:05,360 -ready for training: we just remove the columns we  -don't need anymore with the remove_columns method,   - -31 -00:03:05,920 --> 00:03:10,320 -rename label to labels (since the models  -from Hugging Face Transformers expect that)   - -32 -00:03:11,200 --> 00:03:17,280 -and set the output format to our desired  -backend: torch, tensorflow or numpy. If needed,   - -33 -00:03:17,280 --> 00:03:27,440 -we can also generate a short sample  -of a dataset using the select method. +1 +00:00:00,170 --> 00:00:03,087 +(screen whooshing) + +2 +00:00:05,371 --> 00:00:09,690 +- The Hugging Face Datasets +library: A Quick overview. + +3 +00:00:09,690 --> 00:00:10,917 +The Hugging Face Datasets library + +4 +00:00:10,917 --> 00:00:12,870 +is a library that provides an API + +5 +00:00:12,870 --> 00:00:15,150 +to quickly download many public datasets + +6 +00:00:15,150 --> 00:00:16,200 +and pre-process them. + +7 +00:00:17,070 --> 00:00:19,473 +In this video we will explore to do that. + +8 +00:00:20,520 --> 00:00:23,730 +The downloading part is easy +with the load_dataset function, + +9 +00:00:23,730 --> 00:00:26,010 +you can directly download +and cache a dataset + +10 +00:00:26,010 --> 00:00:28,023 +from its identifier on the Dataset hub. + +11 +00:00:29,160 --> 00:00:33,690 +Here we fetch the MRPC dataset +from the GLUE benchmark, + +12 +00:00:33,690 --> 00:00:36,030 +is a dataset containing pairs of sentences + +13 +00:00:36,030 --> 00:00:38,380 +where the task is to +determine the paraphrases. + +14 +00:00:39,720 --> 00:00:42,120 +The object returned by +the load_dataset function + +15 +00:00:42,120 --> 00:00:45,090 +is a DatasetDict, which +is a sort of dictionary + +16 +00:00:45,090 --> 00:00:46,940 +containing each split of our dataset. + +17 +00:00:48,600 --> 00:00:51,780 +We can access each split +by indexing with its name. 
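As a minimal sketch of the loading step just described, assuming the MRPC subset of GLUE mentioned in the transcript:

from datasets import load_dataset

raw_datasets = load_dataset("glue", "mrpc")  # downloads and caches the dataset
print(raw_datasets)                          # a DatasetDict with train/validation/test splits

train_split = raw_datasets["train"]          # index a split by its name
print(train_split[0])                        # one element, loaded from disk on demand
print(train_split.features)                  # column information, including the label names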
+ +18 +00:00:51,780 --> 00:00:54,540 +This split is then an +instance of the Dataset class, + +19 +00:00:54,540 --> 00:00:57,423 +with columns, here sentence1, sentence2, + +20 +00:00:58,350 --> 00:01:00,813 +label and idx, and rows. + +21 +00:01:02,160 --> 00:01:05,220 +We can access a given +element by its index. + +22 +00:01:05,220 --> 00:01:08,220 +The amazing thing about the +Hugging Face Datasets library + +23 +00:01:08,220 --> 00:01:11,700 +is that everything is saved +to disk using Apache Arrow, + +24 +00:01:11,700 --> 00:01:14,460 +which means that even +if your dataset is huge + +25 +00:01:14,460 --> 00:01:16,219 +you won't get out of RAM, + +26 +00:01:16,219 --> 00:01:18,769 +only the elements you +request are loaded in memory. + +27 +00:01:19,920 --> 00:01:24,510 +Accessing a slice of your dataset +is as easy as one element. + +28 +00:01:24,510 --> 00:01:27,150 +The result is then a +dictionary with list of values + +29 +00:01:27,150 --> 00:01:30,630 +for each keys, here the list of labels, + +30 +00:01:30,630 --> 00:01:32,190 +the list of first sentences, + +31 +00:01:32,190 --> 00:01:33,840 +and the list of second sentences. + +32 +00:01:35,100 --> 00:01:37,080 +The features attribute of a Dataset + +33 +00:01:37,080 --> 00:01:39,840 +gives us more information +about its columns. + +34 +00:01:39,840 --> 00:01:42,150 +In particular, we can see here it gives us + +35 +00:01:42,150 --> 00:01:43,980 +a correspondence between the integers + +36 +00:01:43,980 --> 00:01:46,110 +and names for the labels. + +37 +00:01:46,110 --> 00:01:49,623 +0 stands for not equivalent +and 1 for equivalent. + +38 +00:01:51,630 --> 00:01:54,090 +To pre-process all the +elements of our dataset, + +39 +00:01:54,090 --> 00:01:55,980 +we need to tokenize them. + +40 +00:01:55,980 --> 00:01:58,470 +Have a look at the video +"Pre-process sentence pairs" + +41 +00:01:58,470 --> 00:02:01,800 +for a refresher, but you just +have to send the two sentences + +42 +00:02:01,800 --> 00:02:04,833 +to the tokenizer with some +additional keyword arguments. + +43 +00:02:05,880 --> 00:02:09,300 +Here we indicate a maximum length of 128 + +44 +00:02:09,300 --> 00:02:11,460 +and pad inputs shorter than this length, + +45 +00:02:11,460 --> 00:02:13,060 +truncate inputs that are longer. + +46 +00:02:14,040 --> 00:02:16,170 +We put all of this in a tokenize_function + +47 +00:02:16,170 --> 00:02:18,510 +that we can directly +apply to all the splits + +48 +00:02:18,510 --> 00:02:20,260 +in our dataset with the map method. + +49 +00:02:21,210 --> 00:02:24,120 +As long as the function returns +a dictionary-like object, + +50 +00:02:24,120 --> 00:02:26,580 +the map method will add +new columns as needed + +51 +00:02:26,580 --> 00:02:28,113 +or update existing ones. + +52 +00:02:30,060 --> 00:02:32,520 +To speed up pre-processing +and take advantage + +53 +00:02:32,520 --> 00:02:35,130 +of the fact our tokenizer +is backed by Rust + +54 +00:02:35,130 --> 00:02:38,160 +thanks to the Hugging +Face Tokenizers library, + +55 +00:02:38,160 --> 00:02:40,590 +we can process several +elements at the same time + +56 +00:02:40,590 --> 00:02:43,923 +in our tokenize function, using +the batched=True argument. + +57 +00:02:45,300 --> 00:02:46,980 +Since the tokenizer can handle a list + +58 +00:02:46,980 --> 00:02:50,280 +of first or second sentences, +the tokenize_function + +59 +00:02:50,280 --> 00:02:52,740 +does not need to change for this. 
+ +60 +00:02:52,740 --> 00:02:55,410 +You can also use multiprocessing +with the map method, + +61 +00:02:55,410 --> 00:02:57,460 +check out its documentation linked below. + +62 +00:02:58,740 --> 00:03:02,130 +Once this is done, we are +almost ready for training, + +63 +00:03:02,130 --> 00:03:04,020 +we just remove the columns +we don't need anymore + +64 +00:03:04,020 --> 00:03:06,120 +with the remove_columns method, + +65 +00:03:06,120 --> 00:03:08,580 +rename label to labels, since the models + +66 +00:03:08,580 --> 00:03:11,430 +from the transformers library expect that, + +67 +00:03:11,430 --> 00:03:14,040 +and set the output format +to our desired backend, + +68 +00:03:14,040 --> 00:03:15,893 +torch, tensorflow or numpy. + +69 +00:03:16,800 --> 00:03:19,050 +If needed, we can also +generate a short sample + +70 +00:03:19,050 --> 00:03:21,377 +of a dataset using the select method. + +71 +00:03:22,817 --> 00:03:25,734 +(screen whooshing) + diff --git a/subtitles/en/21_preprocessing-sentence-pairs-(pytorch).srt b/subtitles/en/21_preprocessing-sentence-pairs-(pytorch).srt index 605a85a95..3199e76a7 100644 --- a/subtitles/en/21_preprocessing-sentence-pairs-(pytorch).srt +++ b/subtitles/en/21_preprocessing-sentence-pairs-(pytorch).srt @@ -1,149 +1,294 @@ -1 -00:00:05,200 --> 00:00:11,680 -How to preprocess pairs of sentences? We have  -seen how to tokenize single sentences and batch   - -2 -00:00:11,680 --> 00:00:18,080 -them together in the "Batching inputs together"  -video. If this code look unfamiliar to you,   - -3 -00:00:18,080 --> 00:00:24,160 -be sure to check that video again! Here we will  -focus on tasks that classify pairs of sentences.   - -4 -00:00:25,440 --> 00:00:30,960 -For instance, we may want to classify whether two  -texts are paraphrases or not. Here is an example   - -5 -00:00:30,960 --> 00:00:36,320 -taken from the Quora Question Pairs dataset,  -which focuses on identifying duplicate questions.   - -6 -00:00:37,360 --> 00:00:42,200 -In the first pair, the two questions  -are duplicates; in the second, they   - -7 -00:00:43,360 --> 00:00:47,120 -are not. Another pair classification problem  -is when we want to know if two sentences   - -8 -00:00:47,120 --> 00:00:54,000 -are logically related or not (a problem called  -Natural Language Inference or NLI). In this   - -9 -00:00:54,000 --> 00:00:59,680 -example taken from the MultiNLI dataset, we have  -a pair of sentences for each possible label:   - -10 -00:00:59,680 --> 00:01:04,560 -contradiction, neutral or entailment (which  -is a fancy way of saying the first sentence   - -11 -00:01:04,560 --> 00:01:09,280 -implies the second). So classifying pairs  -of sentences is a problem worth studying.   - -12 -00:01:10,080 --> 00:01:14,880 -In fact, in the GLUE benchmark (which is an  -academic benchmark for text classification),   - -13 -00:01:15,600 --> 00:01:19,600 -8 of the 10 datasets are focused  -on tasks using pairs of sentences.   - -14 -00:01:20,720 --> 00:01:24,240 -That's why models like BERT are often  -pretrained with a dual objective:   - -15 -00:01:25,120 --> 00:01:29,920 -on top of the language modeling objective, they  -often have an objective related to sentence pairs.   - -16 -00:01:31,040 --> 00:01:36,720 -For instance, during pretraining, BERT is shown  -pairs of sentences and must predict both the   - -17 -00:01:36,720 --> 00:01:41,040 -value of randomly masked tokens and whether  -the second sentence follows from the first.   
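Looking back at the datasets overview above, the final clean-up steps it mentions could look roughly like this; `tokenized_datasets` is assumed to come from an earlier map call, and the exact columns to remove depend on your data:

# Assumed to follow a map call like the one sketched earlier.
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")  # or "tensorflow" / "numpy"

# Optionally work on a short sample while experimenting.
small_train = tokenized_datasets["train"].select(range(100))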
- -18 -00:01:42,800 --> 00:01:46,640 -Fortunately, the tokenizer from the  -Transformers library has a nice API   - -19 -00:01:46,640 --> 00:01:52,000 -to deal with pairs of sentences: you just have  -to pass them as two arguments to the tokenizer.   - -20 -00:01:53,200 --> 00:01:57,600 -On top of the input IDs and the attention  -mask we studied already, it returns a new   - -21 -00:01:57,600 --> 00:02:02,800 -field called token type IDs, which tells the  -model which tokens belong to the first sentence   - -22 -00:02:03,440 --> 00:02:09,680 -and which ones belong to the second sentence.  -Zooming in a little bit, here are the input IDs,   - -23 -00:02:09,680 --> 00:02:14,480 -aligned with the tokens they correspond to,  -their respective token type ID and attention   - -24 -00:02:14,480 --> 00:02:21,360 -mask. We can see the tokenizer also added special  -tokens so we have a CLS token, the tokens from the   - -25 -00:02:21,360 --> 00:02:28,720 -first sentence, a SEP token, the tokens from the  -second sentence, and a final SEP token. If we have   - -26 -00:02:28,720 --> 00:02:33,760 -several pairs of sentences, we can tokenize them  -together by passing the list of first sentences,   - -27 -00:02:34,480 --> 00:02:39,360 -then the list of second sentences and all the  -keyword arguments we studied already, like   - -28 -00:02:39,360 --> 00:02:45,600 -padding=True. Zooming in at the result, we can see  -how the tokenizer added padding to the second pair   - -29 -00:02:45,600 --> 00:02:51,200 -of sentences, to make the two outputs the same  -length, and properly dealt with token type IDS   - -30 -00:02:51,200 --> 00:03:03,520 -and attention masks for the two sentences. This  -is then all ready to pass through our model! +1 +00:00:00,000 --> 00:00:03,083 +(graphics whooshing) + +2 +00:00:05,370 --> 00:00:07,413 +- How to pre-process pairs of sentences. + +3 +00:00:09,150 --> 00:00:11,340 +We have seen how to +tokenize single sentences + +4 +00:00:11,340 --> 00:00:12,877 +and batch them together in the, + +5 +00:00:12,877 --> 00:00:15,810 +"Batching inputs together video." + +6 +00:00:15,810 --> 00:00:18,330 +If this code look unfamiliar to you, + +7 +00:00:18,330 --> 00:00:20,030 +be sure to check that video again. + +8 +00:00:21,330 --> 00:00:24,543 +Here will focus on tasks that +classify pair of sentences. + +9 +00:00:25,620 --> 00:00:28,470 +For instance, we may want to +classify whether two texts + +10 +00:00:28,470 --> 00:00:30,360 +are paraphrased or not. + +11 +00:00:30,360 --> 00:00:32,880 +Here is an example taken +from the Quora Question Pairs + +12 +00:00:32,880 --> 00:00:37,530 +dataset, which focuses on +identifying duplicate questions. + +13 +00:00:37,530 --> 00:00:40,650 +In the first pair, the two +questions are duplicates, + +14 +00:00:40,650 --> 00:00:42,000 +in the second they are not. + +15 +00:00:43,283 --> 00:00:45,540 +Another pair classification problem is + +16 +00:00:45,540 --> 00:00:47,400 +when we want to know if two sentences are + +17 +00:00:47,400 --> 00:00:49,590 +logically related or not, + +18 +00:00:49,590 --> 00:00:53,970 +a problem called natural +language inference or NLI. + +19 +00:00:53,970 --> 00:00:57,000 +In this example, taken +from the MultiNLI data set, + +20 +00:00:57,000 --> 00:00:59,880 +we have a pair of sentences +for each possible label. 
+ +21 +00:00:59,880 --> 00:01:02,490 +Contradiction, natural or entailment, + +22 +00:01:02,490 --> 00:01:04,680 +which is a fancy way of +saying the first sentence + +23 +00:01:04,680 --> 00:01:05,793 +implies the second. + +24 +00:01:06,930 --> 00:01:08,820 +So classifying pairs of +sentences is a problem + +25 +00:01:08,820 --> 00:01:10,260 +worth studying. + +26 +00:01:10,260 --> 00:01:12,630 +In fact, in the GLUE benchmark, + +27 +00:01:12,630 --> 00:01:15,750 +which is an academic benchmark +for text classification + +28 +00:01:15,750 --> 00:01:17,910 +eight of the 10 data sets are focused + +29 +00:01:17,910 --> 00:01:19,953 +on tasks using pairs of sentences. + +30 +00:01:20,910 --> 00:01:22,560 +That's why models like BERT + +31 +00:01:22,560 --> 00:01:25,320 +are often pre-trained +with a dual objective. + +32 +00:01:25,320 --> 00:01:27,660 +On top of the language modeling objective, + +33 +00:01:27,660 --> 00:01:31,230 +they often have an objective +related to sentence pairs. + +34 +00:01:31,230 --> 00:01:34,320 +For instance, during +pretraining BERT is shown + +35 +00:01:34,320 --> 00:01:36,810 +pairs of sentences and must predict both + +36 +00:01:36,810 --> 00:01:39,930 +the value of randomly masked +tokens, and whether the second + +37 +00:01:39,930 --> 00:01:41,830 +sentence follow from the first or not. + +38 +00:01:43,084 --> 00:01:45,930 +Fortunately, the tokenizer +from the Transformers library + +39 +00:01:45,930 --> 00:01:49,170 +has a nice API to deal +with pairs of sentences. + +40 +00:01:49,170 --> 00:01:51,270 +You just have to pass +them as two arguments + +41 +00:01:51,270 --> 00:01:52,120 +to the tokenizer. + +42 +00:01:53,430 --> 00:01:55,470 +On top of the input IDs +and the attention mask + +43 +00:01:55,470 --> 00:01:56,970 +we studied already, + +44 +00:01:56,970 --> 00:01:59,910 +it returns a new field +called token type IDs, + +45 +00:01:59,910 --> 00:02:01,790 +which tells the model which tokens belong + +46 +00:02:01,790 --> 00:02:03,630 +to the first sentence, + +47 +00:02:03,630 --> 00:02:05,943 +and which ones belong +to the second sentence. + +48 +00:02:07,290 --> 00:02:09,840 +Zooming in a little bit, +here has an input IDs + +49 +00:02:09,840 --> 00:02:12,180 +aligned with the tokens +they correspond to, + +50 +00:02:12,180 --> 00:02:15,213 +their respective token +type ID and attention mask. + +51 +00:02:16,080 --> 00:02:19,260 +We can see the tokenizer +also added special tokens. + +52 +00:02:19,260 --> 00:02:22,620 +So we have a CLS token, the +tokens from the first sentence, + +53 +00:02:22,620 --> 00:02:25,770 +a SEP token, the tokens +from the second sentence, + +54 +00:02:25,770 --> 00:02:27,003 +and a final SEP token. + +55 +00:02:28,500 --> 00:02:30,570 +If we have several pairs of sentences, + +56 +00:02:30,570 --> 00:02:32,840 +we can tokenize them +together by passing the list + +57 +00:02:32,840 --> 00:02:36,630 +of first sentences, then +the list of second sentences + +58 +00:02:36,630 --> 00:02:39,300 +and all the keyword +arguments we studied already + +59 +00:02:39,300 --> 00:02:40,353 +like padding=True. + +60 +00:02:41,940 --> 00:02:43,140 +Zooming in at the result, + +61 +00:02:43,140 --> 00:02:45,030 +we can see also tokenize added padding + +62 +00:02:45,030 --> 00:02:48,090 +to the second pair sentences +to make the two outputs + +63 +00:02:48,090 --> 00:02:51,360 +the same length, and properly +dealt with token type IDs + +64 +00:02:51,360 --> 00:02:53,643 +and attention masks for the two sentences. 
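A rough sketch of the pair-tokenization API described above; the checkpoint name and example sentences are only illustrative:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # illustrative checkpoint

# A single pair: pass the two sentences as two arguments.
encoded = tokenizer("My name is Sylvain.", "I work at Hugging Face.")
print(encoded["input_ids"])        # CLS, first sentence, SEP, second sentence, SEP
print(encoded["token_type_ids"])   # 0 for the first sentence, 1 for the second
print(encoded["attention_mask"])

# Several pairs: a list of first sentences and a list of second sentences.
batch = tokenizer(
    ["How old are you?", "What is your name?"],
    ["What is your age?", "Where do you live?"],
    padding=True,
)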
+ +65 +00:02:54,900 --> 00:02:57,573 +This is then all ready to +pass through our model. + diff --git a/subtitles/en/22_preprocessing-sentence-pairs-(tensorflow).srt b/subtitles/en/22_preprocessing-sentence-pairs-(tensorflow).srt index 30bb9ba2e..980986853 100644 --- a/subtitles/en/22_preprocessing-sentence-pairs-(tensorflow).srt +++ b/subtitles/en/22_preprocessing-sentence-pairs-(tensorflow).srt @@ -1,153 +1,309 @@ -1 -00:00:05,440 --> 00:00:11,760 -How to preprocess pairs of sentences? We have  -seen how to tokenize single sentences and batch   - -2 -00:00:11,760 --> 00:00:17,280 -them together in the "Batching inputs together"  -video. If this code look unfamiliar to you,   - -3 -00:00:17,840 --> 00:00:23,680 -be sure to check that video again! Here we will  -focus on tasks that classify pairs of sentences.   - -4 -00:00:24,720 --> 00:00:30,400 -For instance, we may want to classify whether two  -texts are paraphrases or not. Here is an example   - -5 -00:00:30,400 --> 00:00:35,520 -taken from the Quora Question Pairs dataset,  -which focuses on identifying duplicate questions.   - -6 -00:00:36,960 --> 00:00:39,760 -In the first pair, the two  -questions are duplicates;   - -7 -00:00:40,400 --> 00:00:45,520 -in the second, they are not. Another pair  -classification problem is when we want to know   - -8 -00:00:45,520 --> 00:00:51,920 -if two sentences are logically related or not (a  -problem called Natural Language Inference or NLI).   - -9 -00:00:52,880 --> 00:00:58,480 -In this example taken from the MultiNLI dataset,  -we have a pair of sentences for each possible   - -10 -00:00:58,480 --> 00:01:04,560 -label: contradiction, neutral or entailment  -(which is a fancy way of saying the first sentence   - -11 -00:01:04,560 --> 00:01:12,240 -implies the second). So classifying pairs of  -sentences is a problem worth studying. In fact,   - -12 -00:01:12,240 --> 00:01:15,840 -in the GLUE benchmark (which is an academic  -benchmark for text classification),   - -13 -00:01:16,640 --> 00:01:20,800 -8 of the 10 datasets are focused  -on tasks using pairs of sentences.   - -14 -00:01:21,920 --> 00:01:26,320 -That's why models like BERT are often  -pretrained with a dual objective:   - -15 -00:01:26,320 --> 00:01:31,040 -on top of the language modeling objective, they  -often have an objective related to sentence pairs.   - -16 -00:01:31,840 --> 00:01:37,520 -For instance, during pretraining, BERT is shown  -pairs of sentences and must predict both the   - -17 -00:01:37,520 --> 00:01:42,080 -value of randomly masked tokens and whether  -the second sentence follows from the first.   - -18 -00:01:43,840 --> 00:01:47,920 -Fortunately, the tokenizer from the  -Transformers library has a nice API   - -19 -00:01:47,920 --> 00:01:53,840 -to deal with pairs of sentences: you just have  -to pass them as two arguments to the tokenizer.   - -20 -00:01:54,640 --> 00:01:59,040 -On top of the input IDs and the attention  -mask we studied already, it returns a new   - -21 -00:01:59,040 --> 00:02:04,320 -field called token type IDs, which tells the  -model which tokens belong to the first sentence   - -22 -00:02:04,880 --> 00:02:11,280 -and which ones belong to the second sentence.  -Zooming in a little bit, here are the input IDs,   - -23 -00:02:11,280 --> 00:02:16,800 -aligned with the tokens they correspond to, their  -respective token type ID and attention mask.   
- -24 -00:02:18,240 --> 00:02:23,440 -We can see the tokenizer also added special  -tokens so we have a CLS token, the tokens   - -25 -00:02:23,440 --> 00:02:29,920 -from the first sentence, a SEP token, the tokens  -from the second sentence, and a final SEP token.   - -26 -00:02:31,440 --> 00:02:36,640 -If we have several pairs of sentences, we can  -tokenize them together by passing the list of   - -27 -00:02:36,640 --> 00:02:42,880 -first sentences, then the list of second sentences  -and all the keyword arguments we studied already,   - -28 -00:02:42,880 --> 00:02:48,800 -like padding=True. Zooming in at the result,  -we can see how the tokenizer added padding   - -29 -00:02:48,800 --> 00:02:52,480 -to the second pair of sentences, to  -make the two outputs the same length,   - -30 -00:02:53,440 --> 00:02:57,280 -and properly dealt with token type IDS  -and attention masks for the two sentences.   - -31 -00:02:58,720 --> 00:03:03,840 -This is then all ready to pass through our model! +1 +00:00:00,225 --> 00:00:02,892 +(air whooshing) + +2 +00:00:05,578 --> 00:00:09,180 +- How to preprocess pairs of sentences? + +3 +00:00:09,180 --> 00:00:11,490 +We have seen how to +tokenize single sentences + +4 +00:00:11,490 --> 00:00:13,020 +and batch them together + +5 +00:00:13,020 --> 00:00:15,660 +in the "Batching inputs together" video. + +6 +00:00:15,660 --> 00:00:18,060 +If this code looks unfamiliar to you, + +7 +00:00:18,060 --> 00:00:19,760 +be sure to check that video again! + +8 +00:00:21,101 --> 00:00:22,110 +Here, we will focus on tasks + +9 +00:00:22,110 --> 00:00:24,033 +that classify pairs of sentences. + +10 +00:00:24,900 --> 00:00:27,030 +For instance, we may want to classify + +11 +00:00:27,030 --> 00:00:29,820 +whether two texts are paraphrases or not. + +12 +00:00:29,820 --> 00:00:30,900 +Here is an example taken + +13 +00:00:30,900 --> 00:00:33,180 +from the Quora Question Pairs dataset, + +14 +00:00:33,180 --> 00:00:36,033 +which focuses on identifying +duplicate questions. + +15 +00:00:37,110 --> 00:00:40,650 +In the first pair, the two +questions are duplicates; + +16 +00:00:40,650 --> 00:00:43,620 +in the second, they are not. + +17 +00:00:43,620 --> 00:00:44,730 +Another classification problem + +18 +00:00:44,730 --> 00:00:46,980 +is when we want to know if two sentences + +19 +00:00:46,980 --> 00:00:49,290 +are logically related or not, + +20 +00:00:49,290 --> 00:00:52,173 +a problem called Natural +Language Inference or NLI. + +21 +00:00:53,100 --> 00:00:55,830 +In this example taken +from the MultiNLI dataset, + +22 +00:00:55,830 --> 00:00:59,460 +we have a pair of sentences +for each possible label: + +23 +00:00:59,460 --> 00:01:02,400 +contradiction, neutral or entailment, + +24 +00:01:02,400 --> 00:01:04,680 +which is a fancy way of +saying the first sentence + +25 +00:01:04,680 --> 00:01:05,853 +implies the second. + +26 +00:01:07,140 --> 00:01:09,000 +So classifying pairs of sentences + +27 +00:01:09,000 --> 00:01:10,533 +is a problem worth studying. + +28 +00:01:11,370 --> 00:01:13,770 +In fact, in the GLUE benchmark, + +29 +00:01:13,770 --> 00:01:16,830 +which is an academic benchmark +for text classification, + +30 +00:01:16,830 --> 00:01:19,680 +eight of the 10 datasets +are focused on tasks + +31 +00:01:19,680 --> 00:01:20,973 +using pairs of sentences. 
+ +32 +00:01:22,110 --> 00:01:24,720 +That's why models like +BERT are often pretrained + +33 +00:01:24,720 --> 00:01:26,520 +with a dual objective: + +34 +00:01:26,520 --> 00:01:28,890 +on top of the language modeling objective, + +35 +00:01:28,890 --> 00:01:32,010 +they often have an objective +related to sentence pairs. + +36 +00:01:32,010 --> 00:01:34,560 +For instance, during pretraining, + +37 +00:01:34,560 --> 00:01:36,690 +BERT is shown pairs of sentences + +38 +00:01:36,690 --> 00:01:39,900 +and must predict both the +value of randomly masked tokens + +39 +00:01:39,900 --> 00:01:41,250 +and whether the second sentence + +40 +00:01:41,250 --> 00:01:42,903 +follows from the first or not. + +41 +00:01:44,070 --> 00:01:47,100 +Fortunately, the tokenizer +from the Transformers library + +42 +00:01:47,100 --> 00:01:50,550 +has a nice API to deal +with pairs of sentences: + +43 +00:01:50,550 --> 00:01:52,650 +you just have to pass +them as two arguments + +44 +00:01:52,650 --> 00:01:53,613 +to the tokenizer. + +45 +00:01:54,900 --> 00:01:56,040 +On top of the input IDs + +46 +00:01:56,040 --> 00:01:58,440 +and the attention mask we studied already, + +47 +00:01:58,440 --> 00:02:01,530 +it returns a new field +called token type IDs, + +48 +00:02:01,530 --> 00:02:03,210 +which tells the model which tokens + +49 +00:02:03,210 --> 00:02:05,100 +belong to the first sentence + +50 +00:02:05,100 --> 00:02:07,350 +and which ones belong +to the second sentence. + +51 +00:02:08,670 --> 00:02:11,430 +Zooming in a little bit, +here are the input IDs, + +52 +00:02:11,430 --> 00:02:13,710 +aligned with the tokens +they correspond to, + +53 +00:02:13,710 --> 00:02:17,193 +their respective token +type ID and attention mask. + +54 +00:02:18,540 --> 00:02:21,300 +We can see the tokenizer +also added special tokens + +55 +00:02:21,300 --> 00:02:25,230 +so we have a CLS token, the +tokens from the first sentence, + +56 +00:02:25,230 --> 00:02:28,590 +a SEP token, the tokens +from the second sentence, + +57 +00:02:28,590 --> 00:02:30,153 +and a final SEP token. + +58 +00:02:31,680 --> 00:02:33,720 +If we have several pairs of sentences, + +59 +00:02:33,720 --> 00:02:35,640 +we can tokenize them together + +60 +00:02:35,640 --> 00:02:38,280 +by passing the list of first sentences, + +61 +00:02:38,280 --> 00:02:40,710 +then the list of second sentences + +62 +00:02:40,710 --> 00:02:43,050 +and all the keyword +arguments we studied already, + +63 +00:02:43,050 --> 00:02:44,133 +like padding=True. + +64 +00:02:45,510 --> 00:02:46,770 +Zooming in at the result, + +65 +00:02:46,770 --> 00:02:49,050 +we can see how the tokenizer added padding + +66 +00:02:49,050 --> 00:02:50,940 +to the second pair of sentences, + +67 +00:02:50,940 --> 00:02:53,490 +to make the two outputs the same length. + +68 +00:02:53,490 --> 00:02:55,620 +It also properly dealt with token type IDS + +69 +00:02:55,620 --> 00:02:57,720 +and attention masks for the two sentences. + +70 +00:02:59,010 --> 00:03:01,460 +This is then all ready to +pass through our model! + +71 +00:03:03,799 --> 00:03:06,466 +(air whooshing) + diff --git a/subtitles/en/23_what-is-dynamic-padding.srt b/subtitles/en/23_what-is-dynamic-padding.srt index 48fbfb69d..64514035a 100644 --- a/subtitles/en/23_what-is-dynamic-padding.srt +++ b/subtitles/en/23_what-is-dynamic-padding.srt @@ -1,188 +1,300 @@ -1 -00:00:05,270 --> 00:00:07,640 -What is dynamic padding? 
- -2 -00:00:07,640 --> 00:00:12,620 -In the "Batching Inputs together" video, we -have seen that to be able to group inputs - -3 -00:00:12,620 --> 00:00:17,320 -of different lengths in the same batch, we -need to add padding tokens to all the short - -4 -00:00:17,320 --> 00:00:20,520 -inputs until they are all of the same length. - -5 -00:00:20,520 --> 00:00:26,300 -Here for instance, the longest sentence is -the third one, and we need to add 5, 2 and - -6 -00:00:26,300 --> 00:00:32,509 -7 pad tokens to the other to have four sentences -of the same lengths. - -7 -00:00:32,509 --> 00:00:37,530 -When dealing with a whole dataset, there are -various padding strategies we can apply. - -8 -00:00:37,530 --> 00:00:41,870 -The most obvious one is to pad all the elements -of the dataset to the same length: the length - -9 -00:00:41,870 --> 00:00:44,129 -of the longest sample. - -10 -00:00:44,129 --> 00:00:48,450 -This will then give us batches that all have -the same shape determined by the maximum sequence - -11 -00:00:48,450 --> 00:00:49,450 -length. - -12 -00:00:49,450 --> 00:00:54,039 -The downside is that batches composed from -short sentences will have a lot of padding - -13 -00:00:54,039 --> 00:01:00,080 -tokens which introduce more computations in -the model we ultimately don't need. - -14 -00:01:00,080 --> 00:01:05,320 -To avoid this, another strategy is to pad -the elements when we batch them together, - -15 -00:01:05,320 --> 00:01:08,240 -to the longest sentence inside the batch. - -16 -00:01:08,240 --> 00:01:12,880 -This way batches composed of short inputs -will be smaller than the batch containing - -17 -00:01:12,880 --> 00:01:15,600 -the longest sentence in the dataset. - -18 -00:01:15,600 --> 00:01:19,090 -This will yield some nice speedup on CPU and -GPU. - -19 -00:01:19,090 --> 00:01:23,130 -The downside is that all batches will then -have different shapes, which slows down training - -20 -00:01:23,130 --> 00:01:24,790 -on other accelerators like TPUs. - -21 -00:01:24,790 --> 00:01:28,850 -Let's see how to apply both strategies in -practice. - -22 -00:01:28,850 --> 00:01:34,750 -We have actually seen how to apply fixed padding -in the Datasets Overview video, when we preprocessed - -23 -00:01:34,750 --> 00:01:39,320 -the MRPC dataset: after loading the dataset -and tokenizer, we applied the tokenization - -24 -00:01:39,320 --> 00:01:45,260 -to all the dataset with padding and truncation -to make all samples of length 128. - -25 -00:01:45,260 --> 00:01:51,630 -As a result, if we pass this dataset to a -PyTorch DataLoader, we get batches of shape - -26 -00:01:51,630 --> 00:01:57,079 -batch size (here 16) by 128. - -27 -00:01:57,079 --> 00:02:01,950 -To apply dynamic padding, we must defer the -padding to the batch preparation, so we remove - -28 -00:02:01,950 --> 00:02:04,789 -that part from our tokenize function. - -29 -00:02:04,789 --> 00:02:08,569 -We still leave the truncation part so that -inputs that are bigger than the maximum length - -30 -00:02:08,569 --> 00:02:14,069 -accepted by the model (usually 512) get truncated -to that length. - -31 -00:02:14,069 --> 00:02:17,629 -Then we pad our samples dynamically by using -a data collator. - -32 -00:02:17,629 --> 00:02:22,110 -Those classes in the Transformers library -are responsible for applying all the final - -33 -00:02:22,110 --> 00:02:27,970 -processing needed before forming a batch, -here DataCollatorWithPadding will pad the - -34 -00:02:27,970 --> 00:02:32,200 -samples to the maximum length inside the batch -of sentences. 
- -35 -00:02:32,200 --> 00:02:36,790 -We pass it to the PyTorch DataLoader as a -collate function, then observe that the batches - -36 -00:02:36,790 --> 00:02:42,950 -generated have various lenghs, all way below -the 128 from before. - -37 -00:02:42,950 --> 00:02:48,200 -Dynamic batching will almost always be faster -on CPUs and GPUs, so you should apply it if - -38 -00:02:48,200 --> 00:02:49,200 -you can. - -39 -00:02:49,200 --> 00:02:53,879 -Remember to switch back to fixed padding however -if you run your training script on TPU or - -40 -00:02:53,879 --> 00:03:00,599 -need batches of fixed shapes. +1 +00:00:00,242 --> 00:00:02,909 +(air whooshing) + +2 +00:00:05,460 --> 00:00:06,963 +- What is dynamic padding? + +3 +00:00:08,630 --> 00:00:10,890 +In the "Batching Inputs together" video, + +4 +00:00:10,890 --> 00:00:12,720 +we have seen that to +be able to group inputs + +5 +00:00:12,720 --> 00:00:15,300 +of different lengths in the same batch, + +6 +00:00:15,300 --> 00:00:18,270 +we need to add padding tokens +to all the short inputs + +7 +00:00:18,270 --> 00:00:20,970 +until they are all of the same length. + +8 +00:00:20,970 --> 00:00:24,600 +Here, for instance, the longest +sentence is the third one, + +9 +00:00:24,600 --> 00:00:27,270 +and we need to add five, +two, or seven pad tokens + +10 +00:00:27,270 --> 00:00:30,090 +to the other sentences +to have four sentences + +11 +00:00:30,090 --> 00:00:31,090 +of the same lengths. + +12 +00:00:32,430 --> 00:00:33,900 +When dealing with a whole dataset, + +13 +00:00:33,900 --> 00:00:36,633 +there are various padding +strategies we can apply. + +14 +00:00:37,560 --> 00:00:39,540 +The most obvious one is +to pad all the elements + +15 +00:00:39,540 --> 00:00:40,923 +of the dataset to the same length: + +16 +00:00:40,923 --> 00:00:43,053 +the length of the longest sample. + +17 +00:00:44,070 --> 00:00:45,330 +This will then give us batches + +18 +00:00:45,330 --> 00:00:46,890 +that all have the same shape + +19 +00:00:46,890 --> 00:00:49,800 +determined by the maximum sequence length. + +20 +00:00:49,800 --> 00:00:52,893 +The downside is that batches +composed from short sentences + +21 +00:00:52,893 --> 00:00:54,960 +will have a lot of padding tokens + +22 +00:00:54,960 --> 00:00:57,660 +which will introduce more +computations in the model + +23 +00:00:57,660 --> 00:00:58,910 +we ultimately don't need. + +24 +00:01:00,060 --> 00:01:03,300 +To avoid this, another +strategy is to pad the elements + +25 +00:01:03,300 --> 00:01:05,280 +when we batch them together, + +26 +00:01:05,280 --> 00:01:08,190 +to the longest sentence inside the batch. + +27 +00:01:08,190 --> 00:01:12,000 +This way, batches composed of +short inputs will be smaller + +28 +00:01:12,000 --> 00:01:13,920 +than the batch containing +the longest sentence + +29 +00:01:13,920 --> 00:01:15,510 +in the dataset. + +30 +00:01:15,510 --> 00:01:18,063 +This will yield some nice +speedup on CPU and GPU. + +31 +00:01:19,110 --> 00:01:20,490 +The downside is that all batches + +32 +00:01:20,490 --> 00:01:22,140 +will then have different shapes, + +33 +00:01:22,140 --> 00:01:24,740 +which slows down training +on accelerators like TPUs. + +34 +00:01:26,160 --> 00:01:29,370 +Let's see how to apply both +strategies in practice. 
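As a rough sketch of the data-collator approach discussed here, assuming a `tokenizer` and a `tokenized_datasets` prepared without padding as in the earlier overview; the batch size is illustrative:

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,  # each batch is padded to its own longest sample
)

batch = next(iter(train_dataloader))
print(batch["input_ids"].shape)  # the sequence length varies from batch to batch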
+ +35 +00:01:29,370 --> 00:01:31,280 +We have actually seen how +to apply fixed padding + +36 +00:01:31,280 --> 00:01:33,390 +in the Datasets Overview video, + +37 +00:01:33,390 --> 00:01:36,030 +when we preprocessed the MRPC dataset: + +38 +00:01:36,030 --> 00:01:38,250 +after loading the dataset and tokenizer, + +39 +00:01:38,250 --> 00:01:40,680 +we applied the tokenization +to all the dataset + +40 +00:01:40,680 --> 00:01:42,480 +with padding and truncation + +41 +00:01:42,480 --> 00:01:45,273 +to make all samples of length 128. + +42 +00:01:46,530 --> 00:01:48,360 +As a result, if we pass this dataset + +43 +00:01:48,360 --> 00:01:50,520 +to a PyTorch DataLoader, + +44 +00:01:50,520 --> 00:01:55,503 +we get batches of shape +batch size, here 16, by 128. + +45 +00:01:57,060 --> 00:01:58,380 +To apply dynamic padding, + +46 +00:01:58,380 --> 00:02:01,440 +we must defer the padding +to the batch preparation, + +47 +00:02:01,440 --> 00:02:04,740 +so we remove that part +from our tokenize function. + +48 +00:02:04,740 --> 00:02:06,150 +We still leave the truncation part + +49 +00:02:06,150 --> 00:02:08,580 +so that inputs that are +bigger than the maximum length + +50 +00:02:08,580 --> 00:02:12,060 +accepted by the model, usually 512, + +51 +00:02:12,060 --> 00:02:13,510 +get truncated to that length. + +52 +00:02:14,940 --> 00:02:16,380 +Then we pad our samples dynamically + +53 +00:02:16,380 --> 00:02:18,330 +by using a data collator. + +54 +00:02:18,330 --> 00:02:20,280 +Those classes in the Transformers library + +55 +00:02:20,280 --> 00:02:22,740 +are responsible for applying +all the final processing + +56 +00:02:22,740 --> 00:02:25,290 +needed before forming a batch, + +57 +00:02:25,290 --> 00:02:28,470 +here DataCollatorWithPadding +will pad the samples + +58 +00:02:28,470 --> 00:02:31,083 +to the maximum length inside +the batch of sentences. + +59 +00:02:32,160 --> 00:02:35,310 +We pass it to the PyTorch +DataLoader as a collate function, + +60 +00:02:35,310 --> 00:02:37,620 +then observe that the batches generated + +61 +00:02:37,620 --> 00:02:38,850 +have various lengths, + +62 +00:02:38,850 --> 00:02:41,253 +all way below the 128 from before. + +63 +00:02:42,660 --> 00:02:44,820 +Dynamic batching will +almost always be faster + +64 +00:02:44,820 --> 00:02:47,913 +on CPUs and GPUs, so you +should apply it if you can. + +65 +00:02:48,930 --> 00:02:51,330 +Remember to switch back +to fixed padding, however, + +66 +00:02:51,330 --> 00:02:53,490 +if you run your training script on TPU + +67 +00:02:53,490 --> 00:02:55,293 +or need batches of fixed shapes. + +68 +00:02:56,917 --> 00:02:59,584 +(air whooshing) + diff --git a/subtitles/en/24_the-trainer-api.srt b/subtitles/en/24_the-trainer-api.srt index bee53acec..55405374a 100644 --- a/subtitles/en/24_the-trainer-api.srt +++ b/subtitles/en/24_the-trainer-api.srt @@ -1,174 +1,382 @@ -1 -00:00:05,280 --> 00:00:11,200 -The Trainer API. The Transformers library  -provides a Trainer API that allows you to   - -2 -00:00:11,200 --> 00:00:17,040 -easily fine-tune transformer models on your own  -dataset. The Trainer class take your datasets,   - -3 -00:00:17,040 --> 00:00:22,240 -your model as well as the training hyperparameters  -and can perform the training on any kind of   - -4 -00:00:22,240 --> 00:00:30,160 -setup (CPU, GPU, multi GPUs, TPUs). It can also  -compute the predictions on any dataset, and if   - -5 -00:00:30,160 --> 00:00:36,720 -you provided metrics, evaluate your model on any  -dataset. 
It can also handle final data-processing   - -6 -00:00:36,720 --> 00:00:41,760 -such as dynamic padding as long as you provide  -the tokenizer or a given data collator.   - -7 -00:00:43,040 --> 00:00:48,160 -We will try this API on the MRPC dataset, since  -it's relatively small and easy to preprocess.   - -8 -00:00:49,520 --> 00:00:54,800 -As we saw in the Datasets overview video, here  -is how we can preprocess it. We do not apply   - -9 -00:00:54,800 --> 00:00:59,840 -padding during the preprocessing as we will use  -dynamic padding with our DataCollatorWithPadding.   - -10 -00:01:00,960 --> 00:01:05,440 -Note that we don't do the final steps of  -renaming/removing columns or set the format   - -11 -00:01:05,440 --> 00:01:11,280 -to torch tensors: the Trainer will do all of  -this automatically for us by analyzing the   - -12 -00:01:11,280 --> 00:01:18,080 -model signature. The last steps before creating  -the Trainer are to define our model and some   - -13 -00:01:18,080 --> 00:01:24,400 -training hyperparameters. We saw how to do the  -first in the model API video. For the second,   - -14 -00:01:24,400 --> 00:01:29,600 -we use the TrainingArguments class. It only needs  -a path to a folder where results and checkpoints   - -15 -00:01:29,600 --> 00:01:34,240 -will be saved, but you can also customize  -all the hyperparameters the Trainer will use:   - -16 -00:01:34,240 --> 00:01:39,600 -learning rate, number of training epochs etc.  -It's then very easy to create a Trainer and   - -17 -00:01:39,600 --> 00:01:44,720 -launch a training. This should display a progress  -bar and after a few minutes (if you are running   - -18 -00:01:44,720 --> 00:01:50,480 -on a GPU) you should have the training finished.  -The result will be rather anticlimatic however,   - -19 -00:01:50,480 --> 00:01:54,880 -as you will only get a training loss which  -doesn't really tell you anything about how you   - -20 -00:01:54,880 --> 00:01:59,920 -model is performing. This is because we didn't  -specify anything metric for the evaluation.   - -21 -00:02:00,960 --> 00:02:05,520 -To get those metrics, we will first gather the  -predictions on the whole evaluation set using the   - -22 -00:02:05,520 --> 00:02:11,760 -predict method. It returns a namedtuple with three  -fields: predictions (which contains the model   - -23 -00:02:11,760 --> 00:02:17,760 -predictions), label_ids (which contains the labels  -if your dataset had them) and metrics (which is   - -24 -00:02:17,760 --> 00:02:24,480 -empty here). The predictions are the logits of  -the models for all the sentences in the dataset,   - -25 -00:02:24,480 --> 00:02:31,440 -so a NumPy array of shape 408 by 2. To match them  -with our labels, we need to take the maximum logit   - -26 -00:02:31,440 --> 00:02:36,560 -for each prediction (to know which of the two  -classes was predicted), which we do with the   - -27 -00:02:36,560 --> 00:02:42,480 -argmax function. Then we can use a Metric from  -the Datasets library: it can be loaded as easily   - -28 -00:02:42,480 --> 00:02:47,200 -as our dataset with the load_metric function,  -and it returns the evaluation metric used for   - -29 -00:02:47,200 --> 00:02:54,080 -the dataser we are using. We can see our model  -did learn something as it is 85.7% accurate.   
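A condensed sketch of the Trainer setup walked through above; the checkpoint name and output folder are illustrative, and `tokenized_datasets`, `data_collator` and `tokenizer` are assumed from the earlier preprocessing:

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
training_args = TrainingArguments("test-trainer")  # folder for results and checkpoints

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape)  # logits for each validation sample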
- -30 -00:02:55,200 --> 00:02:59,920 -To monitor the evaluation metrics during training  -we need to define a compute_metrics function   - -31 -00:02:59,920 --> 00:03:05,200 -that does the same step as before: it takes  -a namedtuple with predictions and labels   - -32 -00:03:05,200 --> 00:03:08,000 -and must return a dictionary with  -the metric we want to keep track of.   - -33 -00:03:09,120 --> 00:03:14,400 -By passing the epoch evaluation strategy to our  -TrainingArguments, we tell the Trainer to evaluate   - -34 -00:03:14,400 --> 00:03:20,400 -at the end of every epoch. Launching a training  -inside a notebook will then display a progress bar   - -35 -00:03:20,400 --> 00:03:29,920 -and complete the table you see  -here as you pass every epoch. +1 +00:00:00,304 --> 00:00:01,285 +(air whooshing) + +2 +00:00:01,285 --> 00:00:02,345 +(air popping) + +3 +00:00:02,345 --> 00:00:05,698 +(air whooshing) + +4 +00:00:05,698 --> 00:00:06,548 +- So Trainer API. + +5 +00:00:08,070 --> 00:00:10,040 +So Transformers Library +provides a Trainer API + +6 +00:00:10,040 --> 00:00:13,320 +that allows you to easily +find tune transformers models + +7 +00:00:13,320 --> 00:00:14,193 +on your dataset. + +8 +00:00:15,150 --> 00:00:17,250 +So Trainer class takes your datasets, + +9 +00:00:17,250 --> 00:00:19,900 +your model as well as the +training hyperparameters + +10 +00:00:20,820 --> 00:00:23,310 +and can perform the training +on any kind of setup, + +11 +00:00:23,310 --> 00:00:26,654 +CPU, GPU, multiple GPUs, TPUs + +12 +00:00:26,654 --> 00:00:28,680 +can also compute the predictions + +13 +00:00:28,680 --> 00:00:31,710 +on any dataset and if you provided metrics + +14 +00:00:31,710 --> 00:00:33,813 +evaluate your model on any dataset. + +15 +00:00:34,950 --> 00:00:36,930 +You can also involve final data processing + +16 +00:00:36,930 --> 00:00:38,670 +such as dynamic padding, + +17 +00:00:38,670 --> 00:00:40,377 +as long as you provide the tokenizer + +18 +00:00:40,377 --> 00:00:42,693 +or given data collator. + +19 +00:00:43,572 --> 00:00:45,900 +We will try this API on the MRPC dataset, + +20 +00:00:45,900 --> 00:00:48,492 +since it's relatively small +and easy to preprocess. + +21 +00:00:48,492 --> 00:00:49,325 +As we saw in the Datasets overview video, + +22 +00:00:49,325 --> 00:00:54,325 +however we can preprocess it. + +23 +00:00:54,511 --> 00:00:57,030 +We do not apply padding +during the preprocessing, + +24 +00:00:57,030 --> 00:00:58,590 +as we will use dynamic padding + +25 +00:00:58,590 --> 00:01:00,083 +before DataCollatorWithPadding. + +26 +00:01:01,170 --> 00:01:02,790 +Note that we don't do the final steps + +27 +00:01:02,790 --> 00:01:04,830 +of renaming removing columns + +28 +00:01:04,830 --> 00:01:06,873 +or set the format to torch tensors. + +29 +00:01:07,710 --> 00:01:10,560 +So Trainer will do all of +this automatically for us + +30 +00:01:10,560 --> 00:01:12,633 +by analyzing the model signature. + +31 +00:01:14,054 --> 00:01:16,650 +The last step before +creating the Trainer are + +32 +00:01:16,650 --> 00:01:17,940 +to define a model + +33 +00:01:17,940 --> 00:01:20,250 +and some training hyperparameters. + +34 +00:01:20,250 --> 00:01:22,653 +We saw to do the first +in the model API video. + +35 +00:01:23,734 --> 00:01:26,790 +For the second we use the +TrainingArguments class. 
+ +36 +00:01:26,790 --> 00:01:28,710 +It only takes a path to a folder + +37 +00:01:28,710 --> 00:01:30,900 +where results and +checkpoint will be saved, + +38 +00:01:30,900 --> 00:01:33,060 +but you can also customize +all the hyperparameters + +39 +00:01:33,060 --> 00:01:34,470 +your Trainer will use, + +40 +00:01:34,470 --> 00:01:37,270 +learning weight, number of +training impacts, et. cetera. + +41 +00:01:38,190 --> 00:01:39,660 +It's been very easy to create a Trainer + +42 +00:01:39,660 --> 00:01:41,400 +and launch a training. + +43 +00:01:41,400 --> 00:01:43,170 +You should display a progress bar + +44 +00:01:43,170 --> 00:01:45,900 +and after a few minutes +if you're running on a GPU + +45 +00:01:45,900 --> 00:01:48,000 +you should have the training finished. + +46 +00:01:48,000 --> 00:01:50,790 +The result will be rather +anticlimactic however, + +47 +00:01:50,790 --> 00:01:52,710 +as you will only get a training loss + +48 +00:01:52,710 --> 00:01:54,300 +which doesn't really tell you anything + +49 +00:01:54,300 --> 00:01:56,820 +about how well your model is performing. + +50 +00:01:56,820 --> 00:01:58,977 +This is because we +didn't specify any metric + +51 +00:01:58,977 --> 00:02:00,273 +for the evaluation. + +52 +00:02:01,200 --> 00:02:02,160 +To get those metrics, + +53 +00:02:02,160 --> 00:02:03,810 +we will first gather the predictions + +54 +00:02:03,810 --> 00:02:06,513 +on the whole evaluation set +using the predict method. + +55 +00:02:07,440 --> 00:02:10,020 +It returns a namedtuple with three fields, + +56 +00:02:10,020 --> 00:02:12,990 +Prediction, which contains +the model of predictions. + +57 +00:02:12,990 --> 00:02:15,030 +Label_IDs, which contains the labels + +58 +00:02:15,030 --> 00:02:16,800 +if your dataset had them + +59 +00:02:16,800 --> 00:02:18,570 +and metrics which is empty here. + +60 +00:02:18,570 --> 00:02:20,520 +We're trying to do that. + +61 +00:02:20,520 --> 00:02:22,470 +The predictions are the +logits of the models + +62 +00:02:22,470 --> 00:02:24,143 +for all the sentences in the dataset. + +63 +00:02:24,143 --> 00:02:27,513 +So a NumPy array of shape 408 by 2. + +64 +00:02:28,500 --> 00:02:30,270 +To match them with our labels, + +65 +00:02:30,270 --> 00:02:31,590 +we need to take the maximum logit + +66 +00:02:31,590 --> 00:02:32,850 +for each prediction + +67 +00:02:32,850 --> 00:02:35,820 +to know which of the two +classes was predicted. + +68 +00:02:35,820 --> 00:02:37,683 +We do this with the argmax function. + +69 +00:02:38,640 --> 00:02:41,550 +Then we can use a metric +from the Datasets library. + +70 +00:02:41,550 --> 00:02:43,500 +It can be loaded as easily as a dataset + +71 +00:02:43,500 --> 00:02:45,360 +with the load metric function + +72 +00:02:45,360 --> 00:02:49,500 +and each returns the evaluation +metric used for the dataset. + +73 +00:02:49,500 --> 00:02:51,600 +We can see our model did learn something + +74 +00:02:51,600 --> 00:02:54,363 +as it is 85.7% accurate. + +75 +00:02:55,440 --> 00:02:57,870 +To monitor the evaluation +matrix during training, + +76 +00:02:57,870 --> 00:02:59,829 +we need to define a +compute_metrics function + +77 +00:02:59,829 --> 00:03:02,670 +that does the same step as before. + +78 +00:03:02,670 --> 00:03:04,728 +It takes a namedtuple with +predictions and labels + +79 +00:03:04,728 --> 00:03:06,327 +and must return a dictionary + +80 +00:03:06,327 --> 00:03:08,427 +with the metrics we want to keep track of. 
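A sketch of such a compute_metrics function, using the GLUE MRPC metric as an illustrative choice and reusing the objects from the earlier Trainer sketch:

import numpy as np
from datasets import load_metric
from transformers import Trainer, TrainingArguments

metric = load_metric("glue", "mrpc")  # illustrative; pick the metric matching your dataset

def compute_metrics(eval_preds):
    logits, labels = eval_preds               # predictions and labels
    predictions = np.argmax(logits, axis=-1)  # highest logit = predicted class
    return metric.compute(predictions=predictions, references=labels)

# Passed to the Trainer together with an epoch evaluation strategy.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)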
+ +81 +00:03:09,360 --> 00:03:11,490 +By passing the epoch evaluation strategy + +82 +00:03:11,490 --> 00:03:13,080 +to our training arguments, + +83 +00:03:13,080 --> 00:03:14,490 +we tell the Trainer to evaluate + +84 +00:03:14,490 --> 00:03:15,903 +at the end of every epoch. + +85 +00:03:17,280 --> 00:03:18,587 +Launching a training inside a notebook + +86 +00:03:18,587 --> 00:03:20,640 +will then display a progress bar + +87 +00:03:20,640 --> 00:03:23,643 +and complete the table you see +here as you pass every epoch. + +88 +00:03:25,400 --> 00:03:28,249 +(air whooshing) + +89 +00:03:28,249 --> 00:03:29,974 +(air decrescendos) + diff --git a/subtitles/en/25_keras-introduction.srt b/subtitles/en/25_keras-introduction.srt index cace81f9f..d2960bba1 100644 --- a/subtitles/en/25_keras-introduction.srt +++ b/subtitles/en/25_keras-introduction.srt @@ -1,129 +1,290 @@ -1 -00:00:05,120 --> 00:00:10,640 -In this video, I'm going to give you a very quick  -introduction to how our transformers models work   - -2 -00:00:10,640 --> 00:00:17,120 -together with Tensorflow and Keras! The very short  -explanation is that all of our Tensorflow models   - -3 -00:00:17,120 --> 00:00:23,760 -are also Keras model objects, and so they have the  -standard Keras model API. If you're an experienced   - -4 -00:00:23,760 --> 00:00:28,640 -ML engineer who's used Keras a lot, that's  -probably all you need to know to start working   - -5 -00:00:28,640 --> 00:00:34,160 -with them. But for everyone else, including  -the prodigal PyTorch engineers out there who   - -6 -00:00:34,160 --> 00:00:39,360 -are returning to the fold, I'm going to quickly  -introduce Keras models, and how we work with them.   - -7 -00:00:40,320 --> 00:00:46,240 -In other videos, which I'll link below, I'll run  -through training with Keras models in more detail.   - -8 -00:00:46,240 --> 00:00:54,640 -But first, what is a Keras model? Your model  -basically contains your entire network:   - -9 -00:00:54,640 --> 00:00:59,600 -It contains the layers, and the weights for  -those layers, and also tells the model what   - -10 -00:00:59,600 --> 00:01:04,560 -to do with them; it defines the whole path  -all the way from your inputs to your outputs.   - -11 -00:01:05,280 --> 00:01:10,880 -If you've used Keras before, you probably  -started by building your model out by   - -12 -00:01:10,880 --> 00:01:17,600 -hand - you added one layer after another, maybe  -using model.add() or the functional approach.   - -13 -00:01:18,480 --> 00:01:26,240 -And there's nothing wrong with that! But you can  -also pre-load an entire model, weights and all.   - -14 -00:01:26,960 --> 00:01:33,920 -This is really helpful, because if you try  -reading the paper or looking at the code,   - -15 -00:01:33,920 --> 00:01:38,400 -you'll see the inside of a Transformer is  -pretty complex, and writing it all out from   - -16 -00:01:38,400 --> 00:01:43,280 -scratch and getting it right would be hard even  -for an experienced machine learning engineer.   - -17 -00:01:43,280 --> 00:01:48,080 -But because it's all packed inside a Model, you  -don't need to worry about that complexity if   - -18 -00:01:48,080 --> 00:01:53,840 -you don't want to! You have the flexibility to  -write any model you like, but you can also just   - -19 -00:01:54,400 --> 00:01:58,640 -load a pre-trained, pre-configured  -transformer model in one line of code.   
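That one line could look roughly like this; the checkpoint name is an illustrative assumption:

import tensorflow as tf
from transformers import TFAutoModel

model = TFAutoModel.from_pretrained("bert-base-cased")  # one line: architecture, config and weights
print(isinstance(model, tf.keras.Model))                # True, so the usual Keras methods apply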
- -20 -00:02:00,000 --> 00:02:09,040 -And whether you write your own model from scratch  -or load a pre-trained one, you interact with the   - -21 -00:02:09,040 --> 00:02:14,560 -model in the same way - through the same few  -methods you're going to see again and again,   - -22 -00:02:15,200 --> 00:02:22,000 -like *fit*, *compile* and *predict,* and we'll  -cover concrete examples of how to use those   - -23 -00:02:22,000 --> 00:02:26,960 -methods in other videos that I'll link below. For  -now the key thing to take away from this video, if   - -24 -00:02:26,960 --> 00:02:31,920 -you've never seen Keras before, is that this neat  -encapsulation means that all of the complexity of   - -25 -00:02:31,920 --> 00:02:36,560 -a huge neural net becomes manageable, because  -you interact with it in exactly the same way,   - -26 -00:02:36,560 --> 00:02:49,760 -using exactly the same methods, as you would  -with a simple model that you wrote out by hand. +1 +00:00:00,430 --> 00:00:03,013 +(upbeat music) + +2 +00:00:05,160 --> 00:00:07,080 +- In this video, I'm going to give you + +3 +00:00:07,080 --> 00:00:10,350 +a very quick introduction to +how our transformer models + +4 +00:00:10,350 --> 00:00:14,040 +work together with Tensorflow and Keras. + +5 +00:00:14,040 --> 00:00:15,510 +The very short explanation + +6 +00:00:15,510 --> 00:00:17,310 +is that all of our Tensorflow models + +7 +00:00:17,310 --> 00:00:19,470 +are also Keras model objects, + +8 +00:00:19,470 --> 00:00:22,950 +and so they have the +standard Keras model API. + +9 +00:00:22,950 --> 00:00:24,960 +If you're an experienced +machine learning engineer + +10 +00:00:24,960 --> 00:00:28,230 +who's used Keras a lot, that's +probably all you need to know + +11 +00:00:28,230 --> 00:00:29,610 +to start working with them. + +12 +00:00:29,610 --> 00:00:30,900 +But for everyone else, + +13 +00:00:30,900 --> 00:00:34,170 +including the prodigal +PyTorch engineers out there + +14 +00:00:34,170 --> 00:00:35,910 +who are returning to the fold, + +15 +00:00:35,910 --> 00:00:38,430 +I'm going to quickly +introduce Keras models, + +16 +00:00:38,430 --> 00:00:40,440 +and how we work with them. + +17 +00:00:40,440 --> 00:00:43,080 +In other videos, which I'll link below, + +18 +00:00:43,080 --> 00:00:46,440 +I'll run through training with +Keras models in more detail. + +19 +00:00:46,440 --> 00:00:50,820 +But first, at a high level, +what is a Keras model? + +20 +00:00:50,820 --> 00:00:54,810 +So your model basically +contains your entire network. + +21 +00:00:54,810 --> 00:00:58,230 +It contains the layers, and +the weights for those layers, + +22 +00:00:58,230 --> 00:01:00,690 +and also tells the model +what to do with them + +23 +00:01:00,690 --> 00:01:02,880 +so it defines the whole path all the way + +24 +00:01:02,880 --> 00:01:05,460 +from your inputs to your outputs. + +25 +00:01:05,460 --> 00:01:07,380 +If you've used Keras before, + +26 +00:01:07,380 --> 00:01:09,480 +you probably started using model objects + +27 +00:01:09,480 --> 00:01:11,850 +by building them out by hand, + +28 +00:01:11,850 --> 00:01:14,250 +you added one layer after another + +29 +00:01:14,250 --> 00:01:18,690 +and maybe using the model.add() +or the functional approach. + +30 +00:01:18,690 --> 00:01:20,490 +And there's nothing wrong with that. + +31 +00:01:21,390 --> 00:01:23,430 +Lots of great models are built that way + +32 +00:01:23,430 --> 00:01:26,970 +but you can also pre-load an +entire model, weights and all. 
+ +33 +00:01:26,970 --> 00:01:29,994 +And this is really +helpful, because if you, + +34 +00:01:29,994 --> 00:01:32,490 +as you can see here, if +you try reading the paper + +35 +00:01:32,490 --> 00:01:34,110 +or if you try looking at the code, + +36 +00:01:34,110 --> 00:01:37,350 +you'll see the inside of a +Transformer is pretty complex, + +37 +00:01:37,350 --> 00:01:40,110 +and writing it all out from +scratch and getting it right + +38 +00:01:40,110 --> 00:01:41,850 +would be hard even for an experienced + +39 +00:01:41,850 --> 00:01:43,500 +machine learning engineer. + +40 +00:01:43,500 --> 00:01:45,870 +But because it's all +packed inside a model, + +41 +00:01:45,870 --> 00:01:48,150 +you don't need to worry +about that complexity on that + +42 +00:01:48,150 --> 00:01:49,140 +if you don't want to. + +43 +00:01:49,140 --> 00:01:51,570 +If you're a researcher, if you +want to really dig in there + +44 +00:01:51,570 --> 00:01:55,650 +you can, but you can also +just load a pre-trained, + +45 +00:01:55,650 --> 00:01:59,013 +pre-configured transformer +model in just one line of code. + +46 +00:02:00,150 --> 00:02:03,480 +And when I mentioned +earlier about the Keras API, + +47 +00:02:03,480 --> 00:02:04,560 +the advantage of it is that + +48 +00:02:04,560 --> 00:02:06,690 +whether you write your +own model from scratch + +49 +00:02:06,690 --> 00:02:09,510 +or load a pre-trained one, +you interact with the model + +50 +00:02:09,510 --> 00:02:11,850 +through that same API, so you use exactly + +51 +00:02:11,850 --> 00:02:13,950 +the same few methods and +you're gonna see them + +52 +00:02:13,950 --> 00:02:16,380 +again and again, these methods like fit, + +53 +00:02:16,380 --> 00:02:19,650 +compile and predict, +and like I've mentioned + +54 +00:02:19,650 --> 00:02:22,530 +we'll cover concrete examples +of how to use those methods + +55 +00:02:22,530 --> 00:02:24,330 +in the videos I'll link below. + +56 +00:02:24,330 --> 00:02:27,000 +For now the key thing to +take away from this video, + +57 +00:02:27,000 --> 00:02:28,950 +if you've never seen Keras before, + +58 +00:02:28,950 --> 00:02:30,870 +is that this neat encapsulation means + +59 +00:02:30,870 --> 00:02:33,090 +that all the complexity +of a huge neural net + +60 +00:02:33,090 --> 00:02:35,430 +becomes manageable, because +you interact with it + +61 +00:02:35,430 --> 00:02:39,000 +in exactly the same way, using +exactly the same methods, + +62 +00:02:39,000 --> 00:02:41,700 +whether it's a huge +pre-trained language model + +63 +00:02:41,700 --> 00:02:43,950 +or a simple model that +you wrote out by hand. + +64 +00:02:45,466 --> 00:02:48,049 +(upbeat music) + diff --git a/subtitles/en/26_fine-tuning-with-tensorflow.srt b/subtitles/en/26_fine-tuning-with-tensorflow.srt index 259fc2f47..fb2536667 100644 --- a/subtitles/en/26_fine-tuning-with-tensorflow.srt +++ b/subtitles/en/26_fine-tuning-with-tensorflow.srt @@ -1,353 +1,567 @@ -1 -00:00:06,069 --> 00:00:11,580 -In this video, we're going to see how to load -and fine-tune a pre-trained model. - -2 -00:00:11,580 --> 00:00:16,010 -It's very quick, and if you've watched our -pipeline videos, which I'll link below, the - -3 -00:00:16,010 --> 00:00:18,330 -process is very similar. - -4 -00:00:18,330 --> 00:00:21,990 -This time, though, we're going to be using -transfer learning and doing some training - -5 -00:00:21,990 --> 00:00:26,660 -ourselves, rather than just loading a model -and using it as-is. 
- -6 -00:00:26,660 --> 00:00:30,610 -To learn more about transfer learning, head -to the 'What is transfer learning?' - -7 -00:00:30,610 --> 00:00:33,000 -video, which we'll link below too! - -8 -00:00:33,000 --> 00:00:35,660 -So now let's look at this code. - -9 -00:00:35,660 --> 00:00:40,340 -To start, we pick which model we want to start -with - in this case we're going to use the - -10 -00:00:40,340 --> 00:00:42,540 -famous, the original BERT. - -11 -00:00:42,540 --> 00:00:50,500 -But what does this monstrosity, 'TFAutoModelForSequenceClassification' -mean? - -12 -00:00:50,500 --> 00:00:56,460 -Well, the TF stands for TensorFlow, and the -rest means "take a language model, and stick - -13 -00:00:56,460 --> 00:01:00,879 -a sequence classification head onto it if -it doesn't have one already". - -14 -00:01:00,879 --> 00:01:05,420 -So what we're going to do here is load BERT, -a general language model, and then do some - -15 -00:01:05,420 --> 00:01:09,490 -transfer learning to use it on our task of -interest. - -16 -00:01:09,490 --> 00:01:13,530 -We load the language model with this one line -of code here, using the "from_pretrained" - -17 -00:01:13,530 --> 00:01:14,530 -method. - -18 -00:01:14,530 --> 00:01:21,230 -That method needs to know two things: Firstly -the name of the model you want it to load, - -19 -00:01:21,230 --> 00:01:29,840 -and secondly how many classes your problem -has. - -20 -00:01:29,840 --> 00:01:33,500 -If you want to follow along with the data -from our datasets videos, which I'll link - -21 -00:01:33,500 --> 00:01:41,200 -below, then you'll have two classes, positive -and negative, and thus num_labels equals two. - -22 -00:01:41,200 --> 00:01:43,590 -What about this "compile" thing? - -23 -00:01:43,590 --> 00:01:47,909 -If you're familiar with Keras, you've probably -seen this already, but if not, this is one - -24 -00:01:47,909 --> 00:01:55,520 -of its core methods - you always need to "compile" -your model before you train it. - -25 -00:01:55,520 --> 00:02:01,240 -Compile needs to know two things: Firstly, -the loss function - what are we trying to - -26 -00:02:01,240 --> 00:02:02,240 -optimize? - -27 -00:02:02,240 --> 00:02:08,509 -Here, we import the sparse categorical crossentropy -loss function - that's a mouthful, but it's - -28 -00:02:08,509 --> 00:02:13,390 -the standard loss function for any neural -network that's doing a classification task. - -29 -00:02:13,390 --> 00:02:18,170 -It basically encourages the network to output -large values for the right class, and low - -30 -00:02:18,170 --> 00:02:21,080 -values for the wrong classes. - -31 -00:02:21,080 --> 00:02:26,140 -Note that you can specify the loss function -as a string, like we did with the optimizer, - -32 -00:02:26,140 --> 00:02:34,319 -but there's a very common pitfall there - by -default, this loss assumes the output is probabilities - -33 -00:02:34,319 --> 00:02:39,650 -after a softmax layer, but what our model -has actually output is the values before the - -34 -00:02:39,650 --> 00:02:50,140 -softmax, often called the "logits" - you saw -these before in the videos about pipelines. - -35 -00:02:50,140 --> 00:02:54,580 -If you get this wrong, your model won't train -and it'll be very annoying to figure out why. 
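The pitfall above is easiest to see in code. This is only a sketch of the two loss configurations; it uses the standard Keras loss class and assumes, as the video does, that the model outputs raw logits.

import tensorflow as tf

# The string shortcut loss="sparse_categorical_crossentropy" builds the loss with
# from_logits=False, i.e. it assumes the model already applied a softmax.
loss_expecting_probabilities = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# Transformers models output pre-softmax values (logits), so this is the setting to use.
loss_expecting_logits = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)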
- -36 -00:02:54,580 --> 00:02:58,500 -In fact, if you remember absolutely nothing -else from this video, remember to always check - -37 -00:02:58,500 --> 00:03:02,990 -whether your model is outputting logits or -probabilities, and to make sure your loss - -38 -00:03:02,990 --> 00:03:05,270 -is set up to match that. - -39 -00:03:05,270 --> 00:03:09,460 -It'll save you a lot of debugging headaches -in your career! - -40 -00:03:09,460 --> 00:03:13,340 -The second thing compile needs to know is -the optimizer you want. - -41 -00:03:13,340 --> 00:03:17,570 -In our case, we use Adam, which is sort of -the standard optimizer for deep learning these - -42 -00:03:17,570 --> 00:03:18,730 -days. - -43 -00:03:18,730 --> 00:03:22,770 -The one thing you might want to change is -the learning rate, and to do that we'll need - -44 -00:03:22,770 --> 00:03:27,330 -to import the actual optimizer rather than -just calling it by string, but we'll talk - -45 -00:03:27,330 --> 00:03:30,050 -about that in another video, which I'll link -below. - -46 -00:03:30,050 --> 00:03:33,610 -For now, let's just try training the model! - -47 -00:03:33,610 --> 00:03:35,830 -So how do you train a model? - -48 -00:03:35,830 --> 00:03:40,670 -Well, if you’ve used Keras before, this -will all be very familiar to you - but if - -49 -00:03:40,670 --> 00:03:43,370 -not, let's look at what we're doing here. - -50 -00:03:43,370 --> 00:03:48,371 -Fit() is pretty much the central method for -Keras models - it tells the model to break - -51 -00:03:48,371 --> 00:03:49,371 -the data into batches and train on it. - -52 -00:03:49,371 --> 00:03:50,371 -So the first input is tokenized text - you -will almost always be getting this from a - -53 -00:03:50,371 --> 00:03:52,120 -tokenizer, and if you want to learn more about -that process, and what exactly the outputs - -54 -00:03:52,120 --> 00:03:53,120 -look like, please check out our videos on -tokenizers - there'll be links below for those - -55 -00:03:53,120 --> 00:03:54,120 -too! - -56 -00:03:54,120 --> 00:03:55,120 -So that's our inputs, and then the second -input is our labels - this is just a one-dimensional - -57 -00:03:55,120 --> 00:03:56,840 -Numpy or Tensorflow array of integers, corresponding -to the classes for our examples, and that’s - -58 -00:03:56,840 --> 00:03:57,840 -it. - -59 -00:03:57,840 --> 00:03:58,840 -If you're following along with the data from -our datasets video, there'll only be two classes, - -60 -00:03:58,840 --> 00:04:00,300 -so this will just be zeroes and ones. - -61 -00:04:00,300 --> 00:04:04,870 -Once we have our inputs and our labels, we -do the same thing with the validation data, - -62 -00:04:04,870 --> 00:04:07,120 -we pass the validation inputs and the validation -labels in a tuple, then we can, if we want, - -63 -00:04:07,120 --> 00:04:15,390 -specify details like the batch_size for training, -and then you just pass it all to model.fit() - -64 -00:04:15,390 --> 00:04:16,540 -and let it rip. - -65 -00:04:16,540 --> 00:04:20,449 -If everything works out, you should see a -little training progress bar as your loss - -66 -00:04:20,449 --> 00:04:21,670 -goes down. - -67 -00:04:21,670 --> 00:04:26,870 -And while that's running you call your boss -and tell him you’re a senior NLP machine - -68 -00:04:26,870 --> 00:04:30,509 -learning engineer now and you’re going to -want a salary review next quarter. 
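Here is a rough sketch of the fit() call described above. It assumes `model` is the compiled model from the earlier lines; the checkpoint name, the text and label variables, and the batch size are placeholder assumptions rather than values from the video.

import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed example checkpoint

# First input: tokenized text, here as a dict of NumPy arrays straight from the tokenizer.
train_inputs = dict(tokenizer(train_texts, padding=True, truncation=True, return_tensors="np"))
val_inputs = dict(tokenizer(val_texts, padding=True, truncation=True, return_tensors="np"))

# Second input: a one-dimensional array of integer labels, zeroes and ones for two classes.
train_labels = np.array(train_label_list)
val_labels = np.array(val_label_list)

model.fit(
    train_inputs,
    train_labels,
    validation_data=(val_inputs, val_labels),  # validation inputs and labels as a tuple
    batch_size=8,                              # example value
)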
- -69 -00:04:30,509 --> 00:04:38,470 -This is really all it takes to apply the power -of a massive pretrained language model to - -70 -00:04:38,470 --> 00:04:40,770 -your NLP problem. - -71 -00:04:40,770 --> 00:04:42,440 -Could we do better, though? - -72 -00:04:42,440 --> 00:04:47,180 -We certainly could, with a few more advanced -Keras features like a tuned, scheduled learning - -73 -00:04:47,180 --> 00:04:50,889 -rate we can get an even lower loss, and an -even more accurate model. - -74 -00:04:50,889 --> 00:04:54,039 -And what do we do with our model once it's -trained? - -75 -00:04:54,039 --> 00:05:02,919 -I'll cover this and more in the videos linked -below! +1 +00:00:00,253 --> 00:00:02,920 +(air whooshing) + +2 +00:00:06,060 --> 00:00:08,070 +- In this video, we're going to see + +3 +00:00:08,070 --> 00:00:11,430 +how to load and fine +tune a pre-trained model. + +4 +00:00:11,430 --> 00:00:12,510 +It's very quick. + +5 +00:00:12,510 --> 00:00:14,490 +And if you've watched our pipeline videos, + +6 +00:00:14,490 --> 00:00:18,150 +which I'll link below, the +process is very similar. + +7 +00:00:18,150 --> 00:00:20,940 +This time, though, we're going +to be using transfer learning + +8 +00:00:20,940 --> 00:00:23,040 +and doing some training ourselves, + +9 +00:00:23,040 --> 00:00:26,400 +rather than just loading a +model and using it as is. + +10 +00:00:26,400 --> 00:00:28,710 +So to learn more about transfer learning, + +11 +00:00:28,710 --> 00:00:31,320 +head to the 'What is +transfer learning?' video, + +12 +00:00:31,320 --> 00:00:33,420 +and we'll link that below as well. + +13 +00:00:33,420 --> 00:00:35,610 +But for now, let's look at this code. + +14 +00:00:35,610 --> 00:00:38,730 +To start, we pick which +model we want to start with. + +15 +00:00:38,730 --> 00:00:40,920 +In this case, we're +going to use the famous, + +16 +00:00:40,920 --> 00:00:42,060 +the original BERT, + +17 +00:00:42,060 --> 00:00:44,850 +as the foundation for our training today. + +18 +00:00:44,850 --> 00:00:46,770 +But what is this monstrosity line, + +19 +00:00:46,770 --> 00:00:48,797 +this +'TFAutoModelForSequenceClassification'? + +20 +00:00:49,860 --> 00:00:51,180 +What does that mean? + +21 +00:00:51,180 --> 00:00:53,130 +Well, the TF stands for TensorFlow. + +22 +00:00:53,130 --> 00:00:54,660 +And the rest means, + +23 +00:00:54,660 --> 00:00:55,950 +take a language model, + +24 +00:00:55,950 --> 00:00:58,380 +and stick a sequence +classification head onto it + +25 +00:00:58,380 --> 00:01:00,750 +if it doesn't have one already. + +26 +00:01:00,750 --> 00:01:02,880 +So this line of code loads BERT, + +27 +00:01:02,880 --> 00:01:05,040 +which is a general purpose language model, + +28 +00:01:05,040 --> 00:01:07,650 +it loads at weights, architecture, and all + +29 +00:01:07,650 --> 00:01:10,920 +and then adds a new sequence +classification head onto it + +30 +00:01:10,920 --> 00:01:13,440 +with randomly initialized weights. + +31 +00:01:13,440 --> 00:01:15,870 +So this method needs to know two things. + +32 +00:01:15,870 --> 00:01:18,270 +Firstly, it needs to know +the name of the model + +33 +00:01:18,270 --> 00:01:21,060 +you wanted to load, the +architecture and weights for. + +34 +00:01:21,060 --> 00:01:23,940 +And secondly, it needs +to know how many classes + +35 +00:01:23,940 --> 00:01:26,693 +your problem has, because +that will determine the size, + +36 +00:01:26,693 --> 00:01:29,610 +the number of neurons in the output head. 
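Putting those two pieces of information together, the loading step can be sketched like this; "bert-base-cased" is an assumed stand-in for "the original BERT".

from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",  # which architecture and weights to load
    num_labels=2,       # how many classes, i.e. how many neurons in the new output head
)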
+ +37 +00:01:29,610 --> 00:01:31,530 +So if you want to follow +along with the data + +38 +00:01:31,530 --> 00:01:34,500 +from our datasets videos, +which I'll link below, + +39 +00:01:34,500 --> 00:01:37,440 +then you'll have two classes, +positive and negative, + +40 +00:01:37,440 --> 00:01:39,723 +and thus num_labels equals two. + +41 +00:01:40,830 --> 00:01:43,230 +But what about this compile line? + +42 +00:01:43,230 --> 00:01:44,970 +Well, if you're familiar with Keras, + +43 +00:01:44,970 --> 00:01:46,920 +you've probably seen this already. + +44 +00:01:46,920 --> 00:01:49,800 +But if not, this is one of +the core methods in Keras + +45 +00:01:49,800 --> 00:01:51,450 +that you're gonna see again, and again. + +46 +00:01:51,450 --> 00:01:54,900 +You always need to compile +your model before you train it. + +47 +00:01:54,900 --> 00:01:57,870 +And compile needs to know two things. + +48 +00:01:57,870 --> 00:02:00,090 +Firstly, it needs to +know the loss function, + +49 +00:02:00,090 --> 00:02:02,340 +which is what you're trying to optimize. + +50 +00:02:02,340 --> 00:02:05,910 +So here, we import the +SparseCategoricalCrossentropy + +51 +00:02:05,910 --> 00:02:07,260 +loss function. + +52 +00:02:07,260 --> 00:02:09,930 +So that's a mouthful, but it's +the standard loss function + +53 +00:02:09,930 --> 00:02:13,260 +for any neural network that's +doing a classification task. + +54 +00:02:13,260 --> 00:02:14,970 +It basically encourages the network + +55 +00:02:14,970 --> 00:02:17,730 +to output large values +for the right class, + +56 +00:02:17,730 --> 00:02:20,910 +and low values for the wrong classes. + +57 +00:02:20,910 --> 00:02:24,150 +Note that you can specify the +loss function as a string, + +58 +00:02:24,150 --> 00:02:26,010 +like we did with the optimizer. + +59 +00:02:26,010 --> 00:02:27,600 +But there's a risk there, + +60 +00:02:27,600 --> 00:02:30,090 +there's a very common +trap people fall into, + +61 +00:02:30,090 --> 00:02:32,580 +which is that by default, +this loss assumes + +62 +00:02:32,580 --> 00:02:36,510 +the output is probabilities +after a softmax layer. + +63 +00:02:36,510 --> 00:02:38,310 +But what our model has actually output + +64 +00:02:38,310 --> 00:02:40,770 +is the values before the softmax, + +65 +00:02:40,770 --> 00:02:43,800 +often called the logits, sometimes logits. + +66 +00:02:43,800 --> 00:02:46,110 +No one's quite sure how +to pronounce that one. + +67 +00:02:46,110 --> 00:02:47,790 +But you probably seen these before + +68 +00:02:47,790 --> 00:02:49,950 +in the video about pipelines. + +69 +00:02:49,950 --> 00:02:52,320 +So if you get this wrong, +your model won't train + +70 +00:02:52,320 --> 00:02:54,723 +and it'll be very annoying +to figure out why. + +71 +00:02:55,590 --> 00:02:57,540 +In future videos, we're gonna see + +72 +00:02:57,540 --> 00:03:00,540 +how to use the model's +internal loss computations, + +73 +00:03:00,540 --> 00:03:02,910 +so that you don't have to +specify the loss yourself + +74 +00:03:02,910 --> 00:03:05,340 +and you don't have to +worry about these details. + +75 +00:03:05,340 --> 00:03:09,480 +But for now, remember to +set from_logits to true. + +76 +00:03:09,480 --> 00:03:11,430 +The second thing compile needs to know + +77 +00:03:11,430 --> 00:03:13,230 +is the optimizer you want. + +78 +00:03:13,230 --> 00:03:15,120 +In our case, we use adam, + +79 +00:03:15,120 --> 00:03:16,830 +which is sort of the standard optimizer + +80 +00:03:16,830 --> 00:03:18,720 +for deep learning these days. 
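A sketch of the compile step just described, assuming `model` was loaded as above. The from_logits=True argument is the detail the earlier warning is about.

import tensorflow as tf

model.compile(
    optimizer="adam",  # the string shortcut is fine while Adam's defaults are acceptable
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)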
+ +81 +00:03:18,720 --> 00:03:20,520 +The one thing you might want to change + +82 +00:03:20,520 --> 00:03:21,780 +is the learning rate. + +83 +00:03:21,780 --> 00:03:24,630 +And to do that, we'll need to +import the actual optimizer + +84 +00:03:24,630 --> 00:03:26,910 +rather than just calling it by string. + +85 +00:03:26,910 --> 00:03:28,680 +But we'll talk about +that in another video, + +86 +00:03:28,680 --> 00:03:30,090 +which I'll link below. + +87 +00:03:30,090 --> 00:03:33,360 +For now, let's just +try training the model. + +88 +00:03:33,360 --> 00:03:35,580 +Well, so how do you train the model? + +89 +00:03:35,580 --> 00:03:37,950 +Again, if you've used Keras before, + +90 +00:03:37,950 --> 00:03:40,350 +this is all going to be +very familiar to you. + +91 +00:03:40,350 --> 00:03:42,210 +But if not, let's very quickly look + +92 +00:03:42,210 --> 00:03:43,710 +at what we're doing here. + +93 +00:03:43,710 --> 00:03:47,010 +fit is pretty much the central +method for Keras models. + +94 +00:03:47,010 --> 00:03:49,983 +It tells the model to train +on the data we're passing in. + +95 +00:03:50,820 --> 00:03:52,920 +So here we pass the datasets we made + +96 +00:03:52,920 --> 00:03:54,510 +in the previous section, + +97 +00:03:54,510 --> 00:03:57,990 +the dataset contains both +our inputs and our labels. + +98 +00:03:57,990 --> 00:04:00,420 +So we don't need to +specify separate labels, + +99 +00:04:00,420 --> 00:04:01,570 +when we're calling fit. + +100 +00:04:02,490 --> 00:04:05,340 +Then we do the same thing +with the validation_data. + +101 +00:04:05,340 --> 00:04:08,190 +And then we can if we want, +we can specify details, + +102 +00:04:08,190 --> 00:04:09,900 +like the number of epochs for training + +103 +00:04:09,900 --> 00:04:12,420 +where there's some other +arguments you can pass to fit. + +104 +00:04:12,420 --> 00:04:15,240 +But in the end, you just +pass all of this to model.fit + +105 +00:04:15,240 --> 00:04:16,440 +and you let it run. + +106 +00:04:16,440 --> 00:04:17,520 +If everything works out, + +107 +00:04:17,520 --> 00:04:19,320 +you should see a little training bar + +108 +00:04:19,320 --> 00:04:21,300 +progressing along as your loss goes down. + +109 +00:04:21,300 --> 00:04:22,290 +And that's it. + +110 +00:04:22,290 --> 00:04:23,123 +While that's running, + +111 +00:04:23,123 --> 00:04:25,380 +you know, you can call +your boss and tell them + +112 +00:04:25,380 --> 00:04:27,810 +you're a senior NLP machine +learning engineer now + +113 +00:04:27,810 --> 00:04:30,900 +and you're gonna want a +salary review next quarter. + +114 +00:04:30,900 --> 00:04:32,880 +These few lines of code +are really all it takes + +115 +00:04:32,880 --> 00:04:34,500 +to apply the power of a massive + +116 +00:04:34,500 --> 00:04:36,510 +pre-trained language problem, + +117 +00:04:36,510 --> 00:04:38,250 +massive pre-trained +language model, excuse me, + +118 +00:04:38,250 --> 00:04:40,080 +to your NLP problem. + +119 +00:04:40,080 --> 00:04:42,150 +But could we do better than this? + +120 +00:04:42,150 --> 00:04:43,920 +I mean, we certainly could. + +121 +00:04:43,920 --> 00:04:45,720 +With a few more advanced Keras features + +122 +00:04:45,720 --> 00:04:47,730 +like a tuned, scheduled learning rate, + +123 +00:04:47,730 --> 00:04:49,290 +we can get an even lower loss + +124 +00:04:49,290 --> 00:04:51,990 +and an even more accurate, +more useful model. + +125 +00:04:51,990 --> 00:04:54,120 +And what do we do with our +model after we train it? 
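Before moving on, here is a compact sketch of the fit() call described above. The dataset names are assumed placeholders for the tf.data datasets built in the datasets videos, with labels already included in each element, and the number of epochs is just an example value.

model.fit(
    tf_train_dataset,                       # inputs and labels together, so no separate labels argument
    validation_data=tf_validation_dataset,
    epochs=3,
)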
+ +126 +00:04:54,120 --> 00:04:55,950 +So all of this is going to +be covered in the videos + +127 +00:04:55,950 --> 00:04:57,963 +that are coming up, so stay tuned. + +128 +00:04:59,220 --> 00:05:01,887 +(air whooshing) + diff --git a/subtitles/en/27_learning-rate-scheduling-with-tensorflow.srt b/subtitles/en/27_learning-rate-scheduling-with-tensorflow.srt index e805ae6dd..4a5688bea 100644 --- a/subtitles/en/27_learning-rate-scheduling-with-tensorflow.srt +++ b/subtitles/en/27_learning-rate-scheduling-with-tensorflow.srt @@ -1,179 +1,468 @@ -1 -00:00:05,120 --> 00:00:11,440 -In our other videos we talked about the basics  -of fine-tuning a language model with Tensorflow   - -2 -00:00:11,440 --> 00:00:18,000 -(and as always, when I refer to videos I'll link  -them below). Still, can we do better? So here's   - -3 -00:00:18,000 --> 00:00:23,040 -the code from our model fine-tuning video, and  -while it works, we could definitely tweak a couple   - -4 -00:00:23,040 --> 00:00:29,040 -of things. By far the most important thing is the  -learning rate. In this video we'll talk about how   - -5 -00:00:29,040 --> 00:00:34,800 -to change it, which will make your training  -much more consistently successful. In fact,   - -6 -00:00:36,080 --> 00:00:42,880 -there are two things we want to change about the  -default learning rate for Adam. The first is that   - -7 -00:00:42,880 --> 00:00:51,520 -it's way too high for our models - by default  -Adam uses a learning rate of 10^-3 1 e minus 3,   - -8 -00:00:51,520 --> 00:00:59,600 -which is very high for training Transformers.  -We're going to start at 5 by 10^-5 5 e minus 5,   - -9 -00:00:59,600 --> 00:01:05,520 -which is 20 times lower than the default. And  -secondly, we don't just want a constant learning   - -10 -00:01:05,520 --> 00:01:10,960 -rate - we can get even better performance if we  -'decay' the learning rate down to a tiny value,   - -11 -00:01:10,960 --> 00:01:17,760 -or even 0, over the course of training. That's  -what this PolynomialDecay schedule thing is doing.   - -12 -00:01:19,200 --> 00:01:20,880 -That name might be intimidating, especially  -if you only vaguely remember what a polynomial   - -13 -00:01:21,600 --> 00:01:25,120 -is from maths class. However, all we need to  -do is tell it how long training is going to be,   - -14 -00:01:25,120 --> 00:01:29,040 -so it decays at the right speed -  -that's what this code here is doing.   - -15 -00:01:30,080 --> 00:01:35,280 -We're computing how many minibatches the model  -is going to see over its entire training run,   - -16 -00:01:35,280 --> 00:01:37,640 -which is the size of the training set, divided  -by the batch_size to get the number of batches   - -17 -00:01:37,640 --> 00:01:42,080 -per epoch, and then multiplied by the  -number of epochs to get the total number   - -18 -00:01:42,080 --> 00:01:47,680 -of batches across the whole training run. Once  -we know how many training steps we're taking,   - -19 -00:01:47,680 --> 00:01:51,360 -we just pass all that information to  -the scheduler and we're ready to go.   - -20 -00:01:54,000 --> 00:01:57,360 -What does the polynomial decay schedule look  -like? With default options, it's actually just a   - -21 -00:01:57,360 --> 00:02:04,720 -linear schedule, so it looks like this - it starts  -at 5e-5, which means 5 times ten to the minus 5,   - -22 -00:02:05,280 --> 00:02:11,120 -and then decays down at a constant rate until  -it hits zero right at the very end of training.   
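A sketch of the schedule setup described above. The batch size, epoch count and the `train_dataset` name are assumptions for illustration; the 5e-5 starting point and the decay to zero are the values discussed here.

from tensorflow.keras.optimizers.schedules import PolynomialDecay

batch_size = 8
num_epochs = 3
# Batches per epoch (training set size divided by batch size) times the number of epochs
# gives the total number of training steps the schedule has to cover.
num_train_steps = (len(train_dataset) // batch_size) * num_epochs

lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5,  # start 20 times lower than Adam's default
    end_learning_rate=0.0,       # decay all the way down to zero
    decay_steps=num_train_steps,
)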
- -23 -00:02:11,120 --> 00:02:33,920 -So why do they call it polynomial and not  -linear? Because if you tweak the options,   - -24 -00:02:36,000 --> 00:02:49,840 -you can get a higher-order decay schedule, but  -there's no need to do that right now. Now, how   - -25 -00:02:49,840 --> 00:02:56,400 -do we use our learning rate schedule? Easy,  -we just pass it to Adam! You'll notice the   - -26 -00:02:56,400 --> 00:03:00,480 -first time when we compiled the model,  -we just passed it the string "adam".   - -27 -00:03:02,320 --> 00:03:07,760 -Keras recognizes the names of common optimizers  -and loss functions if you pass them as strings,   - -28 -00:03:07,760 --> 00:03:12,320 -so it saves time to do that if you only want  -the default settings. But we're professional   - -29 -00:03:12,320 --> 00:03:19,600 -machine learners now, with our very own learning  -rate schedule, so we have to do things properly.   - -30 -00:03:19,600 --> 00:03:26,080 -So first we import the optimizer, then we  -initialize it with our scheduler, and then   - -31 -00:03:29,200 --> 00:03:34,720 -we compile the model using the new optimizer,  -and whatever loss function you want - this will   - -32 -00:03:34,720 --> 00:03:39,040 -be sparse categorical crossentropy if you're  -following along from the fine-tuning video.   - -33 -00:03:39,680 --> 00:03:47,120 -And now we have a high-performance model, ready to  -go. All that remains is to fit the model just like   - -34 -00:03:47,120 --> 00:03:53,280 -we did before! Remember, because we compiled the  -model with the new optimizer with the new learning   - -35 -00:03:53,280 --> 00:03:58,800 -rate schedule, we don't need to change anything  -here. We just call fit again, with exactly the   - -36 -00:03:58,800 --> 00:04:04,320 -same command as before, but now we get beautiful  -training with a nice, smooth learning rate decay. +1 +00:00:00,288 --> 00:00:02,639 +(screen swishing) + +2 +00:00:02,639 --> 00:00:05,190 +(text swishing) + +3 +00:00:05,190 --> 00:00:06,780 +In our other videos, + +4 +00:00:06,780 --> 00:00:08,280 +we talked about the basics + +5 +00:00:08,280 --> 00:00:11,610 +of fine-tuning a language +model with Tensorflow, + +6 +00:00:11,610 --> 00:00:15,030 +and as always, when I refer to +videos I'll link them below. + +7 +00:00:15,030 --> 00:00:17,610 +Still, can we do better? + +8 +00:00:17,610 --> 00:00:20,700 +So here's the code from our +model fine-tuning video, + +9 +00:00:20,700 --> 00:00:21,600 +and while it works, + +10 +00:00:21,600 --> 00:00:24,390 +we could definitely +tweak a couple of things. + +11 +00:00:24,390 --> 00:00:27,540 +By far the most important +thing is the learning rate. + +12 +00:00:27,540 --> 00:00:29,940 +In this video we'll talk +about how to change it, + +13 +00:00:29,940 --> 00:00:31,080 +which will make your training + +14 +00:00:31,080 --> 00:00:33,303 +much more consistently successful. + +15 +00:00:34,440 --> 00:00:37,320 +In fact, really there are two things + +16 +00:00:37,320 --> 00:00:40,530 +we want to change about the +default learning rate for Adam. + +17 +00:00:40,530 --> 00:00:42,720 +So the first we want to change + +18 +00:00:42,720 --> 00:00:45,630 +is that it's way too high for our models, + +19 +00:00:45,630 --> 00:00:48,030 +by default, Adam uses a learning rate + +20 +00:00:48,030 --> 00:00:51,540 +of 10 to the minus 3, 1 E minus 3, + +21 +00:00:51,540 --> 00:00:54,660 +and that's very high for +training transformer models. 
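For reference, the two learning rates being contrasted here look like this in code (a sketch using the standard Keras Adam class):

from tensorflow.keras.optimizers import Adam

default_optimizer = Adam()                     # learning_rate defaults to 1e-3, too high here
lower_lr_optimizer = Adam(learning_rate=5e-5)  # the 20 times lower value used in these videos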
+ +22 +00:00:54,660 --> 00:00:58,200 +We're going to start at +5 by 10 to the minus 5, + +23 +00:00:58,200 --> 00:01:02,700 +5 E minus 5, which is 20 +times lower than the default. + +24 +00:01:02,700 --> 00:01:06,330 +And secondly, we don't just +want a constant learning rate, + +25 +00:01:06,330 --> 00:01:07,950 +we can get even better performance + +26 +00:01:07,950 --> 00:01:11,160 +if we decay the learning +rate down to a tiny value, + +27 +00:01:11,160 --> 00:01:13,920 +or even to zero , over +the course of training. + +28 +00:01:13,920 --> 00:01:15,510 +So that's what this thing here, + +29 +00:01:15,510 --> 00:01:18,540 +this Polynomial Decay +schedule thing is doing. + +30 +00:01:18,540 --> 00:01:21,570 +So I'll show you what that +decay looks like in a second, + +31 +00:01:21,570 --> 00:01:23,160 +but first we need to tell the scheduler + +32 +00:01:23,160 --> 00:01:25,290 +how long training is going to be, + +33 +00:01:25,290 --> 00:01:27,450 +so that it decays at the right speed, + +34 +00:01:27,450 --> 00:01:29,450 +and that's what this code here is doing. + +35 +00:01:30,300 --> 00:01:32,280 +We're computing how many minibatches + +36 +00:01:32,280 --> 00:01:35,520 +the model is going to see +over its entire training run, + +37 +00:01:35,520 --> 00:01:37,950 +which is the size of the training set, + +38 +00:01:37,950 --> 00:01:39,570 +and then we multiply that + +39 +00:01:39,570 --> 00:01:41,220 +by the number of epochs + +40 +00:01:41,220 --> 00:01:42,930 +to get the total number of batches + +41 +00:01:42,930 --> 00:01:45,060 +across the whole training run. + +42 +00:01:45,060 --> 00:01:47,880 +Once we know how many +training steps we're taking, + +43 +00:01:47,880 --> 00:01:50,580 +we just pass all that +information to the scheduler + +44 +00:01:50,580 --> 00:01:51,783 +and we're ready to go. + +45 +00:01:53,110 --> 00:01:57,510 +What does the polynomial +decay schedule look like? + +46 +00:01:57,510 --> 00:01:59,610 +Well, it looks like this, + +47 +00:01:59,610 --> 00:02:02,160 +it starts at 5 E minus 5, + +48 +00:02:02,160 --> 00:02:05,490 +which means 5 times 10 to the minus 5, + +49 +00:02:05,490 --> 00:02:08,190 +and then decays down at a constant rate + +50 +00:02:08,190 --> 00:02:11,310 +until it hits zero right at +the very end of training. 
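Since a Keras learning rate schedule is callable on a step number, the two endpoints described here can be checked directly; this assumes the `lr_scheduler` and `num_train_steps` built earlier.

print(float(lr_scheduler(0)))                # 5e-05 at the very first step
print(float(lr_scheduler(num_train_steps)))  # 0.0 right at the end of training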
+ +51 +00:02:11,310 --> 00:02:13,200 +So hold on, I can already hear you + +52 +00:02:13,200 --> 00:02:14,640 +yelling at your monitor, though, + +53 +00:02:14,640 --> 00:02:16,020 +and yes, I know, + +54 +00:02:16,020 --> 00:02:18,690 +this is actually constant +or a linear decay, + +55 +00:02:18,690 --> 00:02:20,310 +and I know the name is polynomial, + +56 +00:02:20,310 --> 00:02:21,870 +and you're feeling cheated that, you know, + +57 +00:02:21,870 --> 00:02:24,390 +you were promised a polynomial +and haven't gotten it, + +58 +00:02:24,390 --> 00:02:26,550 +so calm down though, it's okay, + +59 +00:02:26,550 --> 00:02:28,830 +because, of course, +linear functions are just + +60 +00:02:28,830 --> 00:02:30,480 +first-order special cases + +61 +00:02:30,480 --> 00:02:32,850 +of the general polynomial functions, + +62 +00:02:32,850 --> 00:02:36,180 +and if you tweak the +options to this class, + +63 +00:02:36,180 --> 00:02:38,130 +you can get a truly polynomial, + +64 +00:02:38,130 --> 00:02:40,170 +a higher-order decay schedule, + +65 +00:02:40,170 --> 00:02:43,140 +but this linear schedule will +work fine for us for now, + +66 +00:02:43,140 --> 00:02:45,210 +we don't actually need all those + +67 +00:02:45,210 --> 00:02:47,610 +fancy tweaks and fancy gadgets. + +68 +00:02:47,610 --> 00:02:49,770 +So coming back, + +69 +00:02:49,770 --> 00:02:51,990 +how do we actually use +this learning rate schedule + +70 +00:02:51,990 --> 00:02:53,460 +once we've created it? + +71 +00:02:53,460 --> 00:02:55,650 +So it's simple, we just pass it to Adam. + +72 +00:02:55,650 --> 00:02:58,560 +So the first time we compiled the model, + +73 +00:02:58,560 --> 00:03:00,840 +we just passed the string Adam, + +74 +00:03:00,840 --> 00:03:02,250 +to get our optimizer. + +75 +00:03:02,250 --> 00:03:05,340 +So Keras recognizes the +names of common optimizers + +76 +00:03:05,340 --> 00:03:07,920 +and loss functions if +you pass them as strings, + +77 +00:03:07,920 --> 00:03:09,480 +so it saves time to do that + +78 +00:03:09,480 --> 00:03:11,460 +if you only want the default settings. + +79 +00:03:11,460 --> 00:03:13,320 +But now we're professional +machine learners, + +80 +00:03:13,320 --> 00:03:15,720 +and, you know, that +salary review is upcoming, + +81 +00:03:15,720 --> 00:03:17,790 +so we've got our very own +learning rate schedule, + +82 +00:03:17,790 --> 00:03:19,770 +and we're gonna do things properly. + +83 +00:03:19,770 --> 00:03:22,830 +So the first we do is +we import the optimizer, + +84 +00:03:22,830 --> 00:03:24,960 +and then we initialize +it with a scheduler, + +85 +00:03:24,960 --> 00:03:27,540 +which is getting passed to +to the learning rate argument + +86 +00:03:27,540 --> 00:03:29,100 +of that optimizer. + +87 +00:03:29,100 --> 00:03:32,190 +And now we compile the model +with this new optimizer, + +88 +00:03:32,190 --> 00:03:34,140 +and again, whatever +loss function you want, + +89 +00:03:34,140 --> 00:03:37,050 +so this is going to be sparse +categorical crossentropy + +90 +00:03:37,050 --> 00:03:39,840 +if you're following along +from the fine-tuning video. + +91 +00:03:39,840 --> 00:03:41,370 +And then, we're we're ready to go, + +92 +00:03:41,370 --> 00:03:43,710 +now we have a high-performance model, + +93 +00:03:43,710 --> 00:03:44,970 +and ready for training. + +94 +00:03:44,970 --> 00:03:46,830 +All that remains is to fit the model + +95 +00:03:46,830 --> 00:03:48,363 +just like we did before. 
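A sketch of the steps just listed, assuming `model` and `lr_scheduler` come from the earlier lines:

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

optimizer = Adam(learning_rate=lr_scheduler)  # the schedule goes into the learning rate argument
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
# model.fit(...) can then be called exactly as before.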
+ +96 +00:03:49,350 --> 00:03:51,600 +Remember, because we compiled the model + +97 +00:03:51,600 --> 00:03:54,300 +with the new optimizer and the +new learning rate schedule, + +98 +00:03:54,300 --> 00:03:56,190 +we actually don't need +to change anything at all + +99 +00:03:56,190 --> 00:03:57,360 +when we call fit, + +100 +00:03:57,360 --> 00:03:58,290 +we just call it again, + +101 +00:03:58,290 --> 00:04:00,540 +with exactly the same command as before, + +102 +00:04:00,540 --> 00:04:02,400 +but now we get a beautiful training, + +103 +00:04:02,400 --> 00:04:04,740 +with a nice, smooth learning rate decay, + +104 +00:04:04,740 --> 00:04:06,330 +starting from a good value, + +105 +00:04:06,330 --> 00:04:07,713 +and decaying down to zero. + +106 +00:04:08,867 --> 00:04:12,059 +(screen swishing) + +107 +00:04:12,059 --> 00:04:13,395 +(screen swishing) + diff --git a/subtitles/en/28_tensorflow-predictions-and-metrics.srt b/subtitles/en/28_tensorflow-predictions-and-metrics.srt index ac82f1437..24f75a7d7 100644 --- a/subtitles/en/28_tensorflow-predictions-and-metrics.srt +++ b/subtitles/en/28_tensorflow-predictions-and-metrics.srt @@ -1,194 +1,461 @@ -1 -00:00:05,600 --> 00:00:10,080 -In our other videos, and as always, there'll  -be links below if you want to check those out,   - -2 -00:00:10,640 --> 00:00:15,600 -we showed you how to initialize and  -fine-tune a transformer model in TensorFlow,   - -3 -00:00:15,600 --> 00:00:20,800 -so the question now is: What can we do with a  -model after we train it? The obvious thing to   - -4 -00:00:20,800 --> 00:00:26,080 -try is to use it to get predictions for new  -data, so let's see how to do that. Again,   - -5 -00:00:26,080 --> 00:00:31,120 -if you're familiar with Keras, the good news is  -that because there are just standard Keras models,   - -6 -00:00:31,680 --> 00:00:35,440 -we can use the standard Keras  -predict() method, as shown here.   - -7 -00:00:36,800 --> 00:00:42,800 -You simply pass in tokenized text to this method,  -like you'd get from a tokenizer, and you get your   - -8 -00:00:42,800 --> 00:00:48,320 -results. Our models can output several different  -things, depending on the options you set,   - -9 -00:00:48,320 --> 00:00:53,280 -but most of the time the thing you want is the  -output logits. If you haven’t come across them   - -10 -00:00:53,280 --> 00:01:02,960 -before, logits are the outputs of the last layer  -of the network, before a softmax has been applied.   - -11 -00:01:02,960 --> 00:01:08,400 -So if you want to turn the logits into the model’s  -probability outputs, you just apply a softmax,   - -12 -00:01:08,400 --> 00:01:13,840 -like so. What if we want to turn those  -probabilities into class predictions?   - -13 -00:01:14,853 --> 00:01:20,960 -Simple, we just pick the biggest probability for  -each output! The easiest way to do that is with   - -14 -00:01:20,960 --> 00:01:26,960 -the argmax function. Argmax will return the  -index of the largest probability in each row,   - -15 -00:01:26,960 --> 00:01:36,400 -which means in this case that we’ll  -get a vector of 0 and 1 values.   - -16 -00:01:37,360 --> 00:01:45,440 -Those are our class predictions! In fact, if  -class predictions are all you want, you can skip   - -17 -00:01:45,440 --> 00:01:50,240 -the softmax step entirely, because the largest  -logit will always be the largest probability   - -18 -00:01:52,400 --> 00:01:56,800 -too. 
If probabilities and class predictions are  -all you want, then you’ve seen everything you   - -19 -00:01:56,800 --> 00:02:02,000 -need at this point! But if you’re interested in  -benchmarking your model or using it for research,   - -20 -00:02:02,000 --> 00:02:06,320 -you might want to delve deeper into the results  -you get. And one way to do that is to compute   - -21 -00:02:06,320 --> 00:02:10,880 -some metrics for the model’s predictions. If  -you're following along with our datasets and   - -22 -00:02:10,880 --> 00:02:16,400 -fine-tuning videos, we got our data from the MRPC  -dataset, which is part of the GLUE benchmark.   - -23 -00:02:16,960 --> 00:02:24,480 -Each of the GLUE datasets, as well as many of  -our other datasets, has some predefined metrics,   - -24 -00:02:24,480 --> 00:02:31,520 -and we can load them easily with the datasets  -load_metric() function. For the MRPC dataset,   - -25 -00:02:31,520 --> 00:02:36,080 -the built-in metrics are accuracy, which just  -measures the percentage of the time the model’s   - -26 -00:02:36,080 --> 00:02:42,160 -prediction was correct, and the F1 score, which is  -a slightly more complex measure that measures how   - -27 -00:02:42,160 --> 00:02:48,960 -well the model trades off precision and recall.  -To compute those metrics to benchmark our model,   - -28 -00:02:48,960 --> 00:02:54,000 -we just pass them the model’s predictions, and  -the ground truth labels, and we get our results.   - -29 -00:02:56,720 --> 00:03:01,120 -If you’re familiar with Keras, though, you’ll  -notice that this is a weird way to compute   - -30 -00:03:01,120 --> 00:03:06,880 -metrics - we’re only computing metrics at the end  -of training, but Keras has the built-in ability to   - -31 -00:03:06,880 --> 00:03:14,960 -compute a wide range of metrics on the fly while  -you're training. If you want to use built-in   - -32 -00:03:14,960 --> 00:03:21,920 -metric computations, it's very straightforward -  -you just pass a 'metric' argument to compile().   - -33 -00:03:22,960 --> 00:03:28,240 -As with things like loss and optimizer, you  -can specify the metrics you want by string,   - -34 -00:03:28,240 --> 00:03:33,520 -or you can import the actual metric objects if you  -want to pass specific arguments to them, but note   - -35 -00:03:33,520 --> 00:03:40,880 -that unlike loss and accuracy, you have to supply  -a list of metrics, even if you only have one. Once   - -36 -00:03:40,880 --> 00:03:46,320 -a model has been compiled with a metric, it will  -report that metric for training, validation and   - -37 -00:03:49,840 --> 00:03:54,880 -predictions. You can even write your own Metric  -classes. Though this is a bit beyond the scope   - -38 -00:03:54,880 --> 00:03:59,440 -of this course, I'll link to the relevant  -TF docs below because it can be very handy   - -39 -00:03:59,440 --> 00:04:10,800 -if you want a metric that isn't supported  -by default in Keras, such as the F1 score. +1 +00:00:00,269 --> 00:00:02,936 +(air whooshing) + +2 +00:00:05,700 --> 00:00:07,110 +- In our other videos, + +3 +00:00:07,110 --> 00:00:09,000 +and as always, there'll be links below + +4 +00:00:09,000 --> 00:00:10,740 +if you want to check those out, + +5 +00:00:10,740 --> 00:00:13,230 +we showed you how to +initialize and fine-tune + +6 +00:00:13,230 --> 00:00:15,690 +a transformer model in TensorFlow. + +7 +00:00:15,690 --> 00:00:18,600 +So the question now is, +what can we do with a model + +8 +00:00:18,600 --> 00:00:20,070 +after we train it? 
+ +9 +00:00:20,070 --> 00:00:21,390 +The obvious thing to try + +10 +00:00:21,390 --> 00:00:23,790 +is to use it to get +predictions for new data, + +11 +00:00:23,790 --> 00:00:25,560 +so let's see how to do that. + +12 +00:00:25,560 --> 00:00:28,320 +Again, if you're familiar +with Keras, the good news + +13 +00:00:28,320 --> 00:00:31,860 +is that because there are +just standard Keras models, + +14 +00:00:31,860 --> 00:00:34,770 +we can use the standard +Keras predict method, + +15 +00:00:34,770 --> 00:00:35,883 +as shown here. + +16 +00:00:36,990 --> 00:00:40,560 +You simply pass in tokenized +text to this method, + +17 +00:00:40,560 --> 00:00:42,330 +like you'd get from a tokenizer, + +18 +00:00:42,330 --> 00:00:44,280 +and you get your results. + +19 +00:00:44,280 --> 00:00:46,740 +Our models can output +several different things, + +20 +00:00:46,740 --> 00:00:48,510 +depending on the options you set, + +21 +00:00:48,510 --> 00:00:50,310 +but most of the time the thing you want + +22 +00:00:50,310 --> 00:00:52,290 +is the output logits. + +23 +00:00:52,290 --> 00:00:54,900 +If you haven't come +across them before logits, + +24 +00:00:54,900 --> 00:00:57,630 +sometimes pronounced to +logits because no one's sure, + +25 +00:00:57,630 --> 00:01:00,390 +are the outputs of the +last layer of the network + +26 +00:01:00,390 --> 00:01:03,150 +because before a softmax has been applied. + +27 +00:01:03,150 --> 00:01:04,710 +So if you want to turn the logits + +28 +00:01:04,710 --> 00:01:06,900 +into the model's probability outputs, + +29 +00:01:06,900 --> 00:01:09,423 +you just apply a softmax, like so. + +30 +00:01:10,981 --> 00:01:12,630 +What if we want to turn +those probabilities + +31 +00:01:12,630 --> 00:01:14,370 +into class predictions? + +32 +00:01:14,370 --> 00:01:16,410 +Again, it's very straightforward. + +33 +00:01:16,410 --> 00:01:19,470 +We just pick the biggest +probability for each output + +34 +00:01:19,470 --> 00:01:23,070 +and you can get that immediately +with the argmax function. + +35 +00:01:23,070 --> 00:01:24,870 +argmax will return the index + +36 +00:01:24,870 --> 00:01:27,120 +of the largest probability in each row + +37 +00:01:27,120 --> 00:01:30,360 +which means that we'll +get a vector of integers. + +38 +00:01:30,360 --> 00:01:34,950 +So zero if the largest probability +was in the zero position, + +39 +00:01:34,950 --> 00:01:37,350 +one in the first position, and so on. + +40 +00:01:37,350 --> 00:01:40,380 +So these are our class +predictions indicating class zero, + +41 +00:01:40,380 --> 00:01:42,300 +class one, and so on. + +42 +00:01:42,300 --> 00:01:45,090 +In fact, if class +predictions are all you want, + +43 +00:01:45,090 --> 00:01:47,310 +you can skip the softmax step entirely + +44 +00:01:47,310 --> 00:01:49,740 +because the largest logit +will always be the largest + +45 +00:01:49,740 --> 00:01:51,303 +probability as well. + +46 +00:01:52,500 --> 00:01:55,800 +So if probabilities and class +predictions are all you want, + +47 +00:01:55,800 --> 00:01:58,350 +then you've seen everything +you need at this point. + +48 +00:01:58,350 --> 00:02:00,630 +But if you're interested +in benchmarking your model + +49 +00:02:00,630 --> 00:02:02,190 +or using it for research, + +50 +00:02:02,190 --> 00:02:05,010 +you might want to delve deeper +into the results you get. + +51 +00:02:05,010 --> 00:02:07,230 +And one way to do that is +to compute some metrics + +52 +00:02:07,230 --> 00:02:09,060 +for the model's predictions. 
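A sketch of the predict-then-postprocess flow described above. `tokenized_inputs` is an assumed placeholder for tokenizer output (for example dict(tokenizer(texts, padding=True, return_tensors="np"))), and `model` is the fine-tuned model.

import tensorflow as tf

outputs = model.predict(tokenized_inputs)
logits = outputs["logits"]                       # raw pre-softmax values
probabilities = tf.nn.softmax(logits, axis=-1)   # per-class probabilities
class_predictions = tf.argmax(logits, axis=-1)   # integer class ids; works on logits directly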
+ +53 +00:02:09,060 --> 00:02:10,920 +If you're following +along with our datasets + +54 +00:02:10,920 --> 00:02:12,390 +and fine tuning videos, + +55 +00:02:12,390 --> 00:02:14,850 +we got our data from the MRPC dataset, + +56 +00:02:14,850 --> 00:02:17,130 +which is part of the GLUE benchmark. + +57 +00:02:17,130 --> 00:02:19,050 +Each of the GLUE datasets + +58 +00:02:19,050 --> 00:02:22,560 +as well as many other datasets +in our dataset, Light Hub + +59 +00:02:22,560 --> 00:02:24,510 +has some predefined metrics, + +60 +00:02:24,510 --> 00:02:26,940 +and we can load them easily + +61 +00:02:26,940 --> 00:02:29,880 +with the datasets load metric function. + +62 +00:02:29,880 --> 00:02:33,720 +For the MRPC dataset, the +built-in metrics are accuracy + +63 +00:02:33,720 --> 00:02:35,790 +which just measures the +percentage of the time + +64 +00:02:35,790 --> 00:02:37,830 +the model's prediction was correct, + +65 +00:02:37,830 --> 00:02:39,780 +and the F1 score, + +66 +00:02:39,780 --> 00:02:41,610 +which is a slightly more complex measure + +67 +00:02:41,610 --> 00:02:43,920 +that measures how well +the model trades off + +68 +00:02:43,920 --> 00:02:45,543 +precision and recall. + +69 +00:02:46,470 --> 00:02:49,110 +To compute those metrics +to benchmark our model, + +70 +00:02:49,110 --> 00:02:51,480 +we just pass them the model's predictions, + +71 +00:02:51,480 --> 00:02:53,220 +and to the ground truth labels, + +72 +00:02:53,220 --> 00:02:56,880 +and we get our results in a +straightforward Python dict. + +73 +00:02:56,880 --> 00:02:58,740 +If you're familiar with Keras though, + +74 +00:02:58,740 --> 00:03:00,870 +you might notice that this +is a slightly weird way + +75 +00:03:00,870 --> 00:03:01,800 +to compute metrics, + +76 +00:03:01,800 --> 00:03:02,970 +because we're only computing metrics + +77 +00:03:02,970 --> 00:03:04,440 +at the very end of training. + +78 +00:03:04,440 --> 00:03:06,480 +But in Keras, you have +this built-in ability + +79 +00:03:06,480 --> 00:03:08,790 +to compute a wide range of metrics + +80 +00:03:08,790 --> 00:03:10,470 +on the fly while you're training, + +81 +00:03:10,470 --> 00:03:11,910 +which gives you a very useful insight + +82 +00:03:11,910 --> 00:03:13,740 +into how training is going. + +83 +00:03:13,740 --> 00:03:15,900 +So if you want to use built-in metrics, + +84 +00:03:15,900 --> 00:03:17,280 +it's very straightforward + +85 +00:03:17,280 --> 00:03:19,350 +and you use the standard +Keras approach again. + +86 +00:03:19,350 --> 00:03:23,160 +You just pass a metric +argument to the compile method. + +87 +00:03:23,160 --> 00:03:25,740 +As with things like loss and optimizer, + +88 +00:03:25,740 --> 00:03:28,470 +you can specify the +metrics you want by string + +89 +00:03:28,470 --> 00:03:30,810 +or you can import the +actual metric objects + +90 +00:03:30,810 --> 00:03:33,240 +and pass specific arguments to them. + +91 +00:03:33,240 --> 00:03:35,610 +But note that unlike loss and accuracy, + +92 +00:03:35,610 --> 00:03:37,710 +you have to supply metrics as a list + +93 +00:03:37,710 --> 00:03:39,760 +even if there's only one metric you want. + +94 +00:03:40,770 --> 00:03:43,140 +Once a model has been +compiled with a metric, + +95 +00:03:43,140 --> 00:03:45,360 +it will report that metric for training, + +96 +00:03:45,360 --> 00:03:47,643 +validation, and predictions. + +97 +00:03:48,480 --> 00:03:50,820 +Assuming there are labels +passed to the predictions. 
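Both ways of computing metrics mentioned above can be sketched as follows. `class_predictions` and `labels` stand for integer predictions and ground-truth labels from the previous steps, and note that newer datasets releases move load_metric into the separate evaluate library.

import tensorflow as tf
from datasets import load_metric

# End-of-training benchmarking: MRPC's predefined metrics are accuracy and F1.
metric = load_metric("glue", "mrpc")
print(metric.compute(predictions=class_predictions, references=labels))

# On-the-fly metrics during training: pass a list to compile, even for a single metric.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)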
+ +98 +00:03:50,820 --> 00:03:53,400 +You can even write your +own metric classes. + +99 +00:03:53,400 --> 00:03:55,920 +Although this is a bit beyond +the scope of this course, + +100 +00:03:55,920 --> 00:03:58,200 +I'll link to the relevant TF docs below + +101 +00:03:58,200 --> 00:03:59,580 +because it can be very handy + +102 +00:03:59,580 --> 00:04:01,320 +if you want a metric that isn't supported + +103 +00:04:01,320 --> 00:04:02,850 +by default in Keras, + +104 +00:04:02,850 --> 00:04:04,473 +such as the F1 score. + +105 +00:04:06,076 --> 00:04:08,743 +(air whooshing) + diff --git a/subtitles/en/29_write-your-training-loop-in-pytorch.srt b/subtitles/en/29_write-your-training-loop-in-pytorch.srt index dddf45af7..a517fd436 100644 --- a/subtitles/en/29_write-your-training-loop-in-pytorch.srt +++ b/subtitles/en/29_write-your-training-loop-in-pytorch.srt @@ -1,332 +1,536 @@ -1 -00:00:05,430 --> 00:00:07,240 -Write your own training loop in PyTorch. - -2 -00:00:07,240 --> 00:00:11,759 -In this video, we will look at how we can -do the same fine-tuning as in the Trainer - -3 -00:00:11,759 --> 00:00:14,120 -video, but without relying on that class. - -4 -00:00:14,120 --> 00:00:20,369 -This way you will be able to easily customize -each step of the training loop to your needs. - -5 -00:00:20,369 --> 00:00:23,859 -This is also very useful to manually debug -something that went wrong with the Trainer - -6 -00:00:23,859 --> 00:00:26,189 -API. - -7 -00:00:26,189 --> 00:00:31,200 -Before we dive into the code, here is a sketch -of a training loop: we take a batch of training - -8 -00:00:31,200 --> 00:00:33,469 -data and feed it to the model. - -9 -00:00:33,469 --> 00:00:36,600 -With the labels, we can then compute a loss. - -10 -00:00:36,600 --> 00:00:41,130 -That number is not useful on its own, but -is used to compute the gradients of our model - -11 -00:00:41,130 --> 00:00:46,750 -weights, that is the derivative of the loss -with respect to each model weight. - -12 -00:00:46,750 --> 00:00:51,920 -Those gradients are then used by the optimizer -to update the model weights and make them - -13 -00:00:51,920 --> 00:00:53,360 -a little bit better. - -14 -00:00:53,360 --> 00:00:56,170 -We then repeat the process with a new batch -of training data. - -15 -00:00:56,170 --> 00:01:00,969 -If any of this is unclear, don't hesitate -to take a refresher on your favorite deep - -16 -00:01:00,969 --> 00:01:02,240 -learning course. - -17 -00:01:02,240 --> 00:01:07,560 -We will use the GLUE MRPC dataset here again, -and we have seen how to preprocess the data - -18 -00:01:07,560 --> 00:01:10,439 -using the Datasets library with dynamic padding. - -19 -00:01:10,439 --> 00:01:15,549 -Checkout the videos linked below if you haven't -seen them already. - -20 -00:01:15,549 --> 00:01:20,060 -With this done, we only have to define PyTorch -DataLoaders, which will be responsible to - -21 -00:01:20,060 --> 00:01:24,480 -convert the elements of our dataset into batches. - -22 -00:01:24,480 --> 00:01:33,890 -We use our DataCollatorForPadding as the collate -function, and shuffle the training set. - -23 -00:01:33,890 --> 00:01:39,460 -To check that everything works as intended, -we try to grab a batch of data and inspect - -24 -00:01:39,460 --> 00:01:40,460 -it. - -25 -00:01:40,460 --> 00:01:44,790 -Like our dataset elements, it's a dictionary, -but this time the values are not a single - -26 -00:01:44,790 --> 00:01:50,460 -list of integers, but a tensor of shape batch -size by sequence length. 
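A sketch of the DataLoader setup and the sanity check described above. It assumes `tokenizer` and `tokenized_datasets` come from the preprocessing referenced here; the collator class is called DataCollatorWithPadding in Transformers, and the batch size is an example value.

from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)  # pads each batch dynamically

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"], batch_size=8, collate_fn=data_collator
)

# Grab one batch and inspect it: each value is a tensor of shape (batch_size, sequence_length).
batch = next(iter(train_dataloader))
print({k: v.shape for k, v in batch.items()})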
- -27 -00:01:50,460 --> 00:01:52,869 -The next step is to send the training data -in our model. - -28 -00:01:52,869 --> 00:01:56,790 -For that, we will need to create our model. - -29 -00:01:56,790 --> 00:02:01,240 -As seen in the model API video, we use the -from_pretrained method and adjust the number - -30 -00:02:01,240 --> 00:02:06,159 -of labels to the number of classes we have -on this dataset, here two. - -31 -00:02:06,159 --> 00:02:11,020 -Again, to be sure everything is going well, -we pass the batch we grabbed to our model - -32 -00:02:11,020 --> 00:02:12,640 -and check there is no error. - -33 -00:02:12,640 --> 00:02:17,780 -If the labels are provided, the models of -the Transformers library always return the - -34 -00:02:17,780 --> 00:02:18,840 -loss directly. - -35 -00:02:18,840 --> 00:02:24,129 -We will be able to do loss.backward() to compute -all the gradients, and will then need an optimizer - -36 -00:02:24,129 --> 00:02:26,480 -to do the training step. - -37 -00:02:26,480 --> 00:02:30,800 -We use the AdamW optimizer here, which is -a variant of Adam with proper weight decay, - -38 -00:02:30,800 --> 00:02:35,040 -but you can pick any PyTorch optimizer you -like. - -39 -00:02:35,040 --> 00:02:39,519 -Using the previous loss and computing the -gradients with loss.backward(), we check that - -40 -00:02:39,519 --> 00:02:43,510 -we can do the optimizer step without any error. - -41 -00:02:43,510 --> 00:02:47,580 -Don't forget to zero your gradient afterward, -or at the next step they will get added to - -42 -00:02:47,580 --> 00:02:49,659 -the gradients you compute! - -43 -00:02:49,659 --> 00:02:53,620 -We could already write our training loop, -but we will add two more things to make it - -44 -00:02:53,620 --> 00:02:55,590 -as good as it can be. - -45 -00:02:55,590 --> 00:03:01,150 -The first one is a learning rate scheduler, -to progressively decay our learning rate to - -46 -00:03:01,150 --> 00:03:02,150 -zero. - -47 -00:03:02,150 --> 00:03:06,180 -The get_scheduler function from the Transformers -library is just a convenience function to - -48 -00:03:06,180 --> 00:03:12,760 -easily build such a scheduler, you can again -use any PyTorch learning rate scheduler instead. - -49 -00:03:12,760 --> 00:03:17,299 -Finally, if we want our training to take a -couple of minutes instead of a few hours, - -50 -00:03:17,299 --> 00:03:19,580 -we will need to use a GPU. - -51 -00:03:19,580 --> 00:03:24,340 -The first step is to get one, for instance -by using a colab notebook. - -52 -00:03:24,340 --> 00:03:29,090 -Then you need to actually send your model -and training data on it by using a torch device. - -53 -00:03:29,090 --> 00:03:35,659 -Double-check the following lines print a CUDA -device for you! - -54 -00:03:35,659 --> 00:03:38,450 -We can now put everything together! - -55 -00:03:38,450 --> 00:03:42,470 -First we put our model in training mode (which -will activate the training behavior for some - -56 -00:03:42,470 --> 00:03:47,900 -layers like Dropout) then go through the number -of epochs we picked and all the data in our - -57 -00:03:47,900 --> 00:03:50,130 -training dataloader. - -58 -00:03:50,130 --> 00:03:54,560 -Then we go through all the steps we have seen -already: send the data to the GPU, compute - -59 -00:03:54,560 --> 00:03:57,870 -the model outputs, and in particular the loss. - -60 -00:03:57,870 --> 00:04:02,040 -Use the loss to compute gradients, then make -a training step with the optimizer. 
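Here is a sketch of the loop body described in these lines, with the scheduler update and gradient zeroing mentioned in the next lines included so the step is complete. `model`, `optimizer`, `lr_scheduler`, `train_dataloader`, `device` and `num_epochs` are assumed to come from the setup described above.

model.train()  # activate training behaviour for layers like Dropout
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}  # send the batch to the GPU
        outputs = model(**batch)   # labels are in the batch, so the loss is returned directly
        loss = outputs.loss
        loss.backward()            # compute gradients
        optimizer.step()           # training step: update the weights
        lr_scheduler.step()        # advance the learning rate schedule
        optimizer.zero_grad()      # avoid accumulating gradients into the next step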
- -61 -00:04:02,040 --> 00:04:06,760 -Update the learning rate in our scheduler -for the next iteration and zero the gradients - -62 -00:04:06,760 --> 00:04:09,340 -of the optimizer. - -63 -00:04:09,340 --> 00:04:13,590 -Once this is finished, we can evaluate our -model very easily with a metric from the Datasets - -64 -00:04:13,590 --> 00:04:14,730 -library. - -65 -00:04:14,730 --> 00:04:22,470 -First we put our model in evaluation mode, -then go through all the data in the evaluation - -66 -00:04:22,470 --> 00:04:23,900 -data loader. - -67 -00:04:23,900 --> 00:04:27,480 -As we have seen in the Trainer video, the -model outputs logits and we need to apply - -68 -00:04:27,480 --> 00:04:31,350 -the argmax function to convert them into predictions. - -69 -00:04:31,350 --> 00:04:36,910 -The metric object then has an add_batch method -we can use to send it those intermediate predictions. - -70 -00:04:36,910 --> 00:04:40,590 -Once the evaluation loop is finished, we just -have to call the compute method to get our - -71 -00:04:40,590 --> 00:04:41,620 -final results! - -72 -00:04:41,620 --> 00:04:50,760 -Congratulations, you have now fine-tuned a -model all by yourself! +1 +00:00:00,298 --> 00:00:01,511 +(air whooshing) + +2 +00:00:01,511 --> 00:00:02,769 +(smiley face popping) + +3 +00:00:02,769 --> 00:00:05,460 +(air whooshing) + +4 +00:00:05,460 --> 00:00:08,486 +- Write your own training +loop with PyTorch. + +5 +00:00:08,486 --> 00:00:09,960 +In this video, we'll look at + +6 +00:00:09,960 --> 00:00:12,750 +how we can do the same fine-tuning +as in the Trainer video, + +7 +00:00:12,750 --> 00:00:14,760 +but without relying on that class. + +8 +00:00:14,760 --> 00:00:17,790 +This way, you'll be able to +easily customize each step + +9 +00:00:17,790 --> 00:00:20,310 +to the training loop to your needs. + +10 +00:00:20,310 --> 00:00:21,660 +This is also very useful + +11 +00:00:21,660 --> 00:00:22,740 +to manually debug something + +12 +00:00:22,740 --> 00:00:24,590 +that went wrong with the Trainer API. + +13 +00:00:26,220 --> 00:00:28,020 +Before we dive into the code, + +14 +00:00:28,020 --> 00:00:30,481 +here is a sketch of a training loop. + +15 +00:00:30,481 --> 00:00:33,381 +We take a batch of training +data and feed it to the model. + +16 +00:00:34,223 --> 00:00:36,960 +With the labels, we can +then compute a loss. + +17 +00:00:36,960 --> 00:00:39,316 +That number is not useful in its own, + +18 +00:00:39,316 --> 00:00:40,260 +that is used to compute + +19 +00:00:40,260 --> 00:00:42,150 +the ingredients of our model weights, + +20 +00:00:42,150 --> 00:00:43,440 +that is the derivative of the loss + +21 +00:00:44,610 --> 00:00:47,160 +with respect to each model weight. + +22 +00:00:47,160 --> 00:00:49,800 +Those gradients are then +used by the optimizer + +23 +00:00:49,800 --> 00:00:51,210 +to update the model weights, + +24 +00:00:51,210 --> 00:00:53,550 +and make them a little bit better. + +25 +00:00:53,550 --> 00:00:54,510 +We then repeat the process + +26 +00:00:54,510 --> 00:00:56,880 +with a new batch of training data. + +27 +00:00:56,880 --> 00:00:58,620 +If any of this isn't clear, + +28 +00:00:58,620 --> 00:01:00,270 +don't hesitate to take a refresher + +29 +00:01:00,270 --> 00:01:02,170 +on your favorite deep learning course. + +30 +00:01:03,210 --> 00:01:06,000 +We'll use the GLUE MRPC +data set here again, + +31 +00:01:06,000 --> 00:01:07,680 +and we've seen how to prepropose the data + +32 +00:01:07,680 --> 00:01:11,130 +using the Datasets library +with dynamic padding. 
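For reference, the preprocessing referenced here might look roughly like this. The checkpoint name and the exact column handling are assumptions based on the referenced videos, not something this transcript spells out.

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("glue", "mrpc")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # assumed example checkpoint

def tokenize_function(examples):
    # Truncate but do not pad here; padding happens per batch (dynamic padding).
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")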
+ +33 +00:01:11,130 --> 00:01:12,630 +Check out the videos link below + +34 +00:01:12,630 --> 00:01:14,280 +if you haven't seen them already. + +35 +00:01:15,480 --> 00:01:18,930 +With this done, we only have +to define PyTorch DataLoaders + +36 +00:01:18,930 --> 00:01:20,610 +which will be responsible to convert + +37 +00:01:20,610 --> 00:01:23,253 +the elements of our dataset into patches. + +38 +00:01:24,450 --> 00:01:27,960 +We use our DataColletorForPadding +as a collate function, + +39 +00:01:27,960 --> 00:01:29,460 +and shuffle the training set + +40 +00:01:29,460 --> 00:01:31,080 +to make sure we don't go over the samples + +41 +00:01:31,080 --> 00:01:33,870 +in the same order at a epoch*. + +42 +00:01:33,870 --> 00:01:36,390 +To check that everything +works as intended, + +43 +00:01:36,390 --> 00:01:38,883 +we try to grab a batch +of data, and inspect it. + +44 +00:01:40,080 --> 00:01:43,050 +Like our data set elements, +it's a dictionary, + +45 +00:01:43,050 --> 00:01:46,260 +but these times the values are +not a single list of integers + +46 +00:01:46,260 --> 00:01:49,053 +but a tensor of shape batch +size by sequence length. + +47 +00:01:50,460 --> 00:01:53,580 +The next step is to send the +training data in our model. + +48 +00:01:53,580 --> 00:01:56,730 +For that, we'll need to +actually create a model. + +49 +00:01:56,730 --> 00:01:58,740 +As seen in the Model API video, + +50 +00:01:58,740 --> 00:02:00,540 +we use the from_pretrained method, + +51 +00:02:00,540 --> 00:02:03,270 +and adjust the number of +labels to the number of classes + +52 +00:02:03,270 --> 00:02:06,810 +we have on this data set, here two. + +53 +00:02:06,810 --> 00:02:08,940 +Again to be sure everything is going well, + +54 +00:02:08,940 --> 00:02:11,100 +we pass the batch we grabbed to our model, + +55 +00:02:11,100 --> 00:02:13,320 +and check there is no error. + +56 +00:02:13,320 --> 00:02:14,940 +If the labels are provided, + +57 +00:02:14,940 --> 00:02:16,590 +the models of the Transformers library + +58 +00:02:16,590 --> 00:02:18,273 +always returns a loss directly. + +59 +00:02:19,525 --> 00:02:21,090 +We will be able to do loss.backward() + +60 +00:02:21,090 --> 00:02:22,860 +to compute all the gradients, + +61 +00:02:22,860 --> 00:02:26,460 +and will then need an optimizer +to do the training step. + +62 +00:02:26,460 --> 00:02:28,860 +We use the AdamW optimizer here, + +63 +00:02:28,860 --> 00:02:31,440 +which is a variant of Adam +with proper weight decay, + +64 +00:02:31,440 --> 00:02:33,840 +but you can pick any +PyTorch optimizer you like. + +65 +00:02:34,830 --> 00:02:36,150 +Using the previous loss, + +66 +00:02:36,150 --> 00:02:39,060 +and computing the gradients +with loss.backward(), + +67 +00:02:39,060 --> 00:02:41,130 +we check that we can do the optimizer step + +68 +00:02:41,130 --> 00:02:42,030 +without any error. + +69 +00:02:43,380 --> 00:02:45,870 +Don't forget to zero +your gradient afterwards, + +70 +00:02:45,870 --> 00:02:46,890 +or at the next step, + +71 +00:02:46,890 --> 00:02:49,343 +they will get added to the +gradients you computed. + +72 +00:02:50,490 --> 00:02:52,080 +We could already write our training loop, + +73 +00:02:52,080 --> 00:02:53,220 +but we will add two more things + +74 +00:02:53,220 --> 00:02:55,620 +to make it as good as it can be. + +75 +00:02:55,620 --> 00:02:57,690 +The first one is a +learning rate scheduler, + +76 +00:02:57,690 --> 00:03:00,140 +to progressively decay +our learning rate to zero. 
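Before wiring in the scheduler, here is a minimal sketch of the model and optimizer setup described above. The checkpoint name and the learning rate are example assumptions, and `batch` is the batch grabbed from the training dataloader earlier.

from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
optimizer = AdamW(model.parameters(), lr=5e-5)  # Adam with proper weight decay

# Sanity check: with labels in the batch, the model returns the loss directly.
outputs = model(**batch)
print(outputs.loss)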
+

+77
+00:03:01,195 --> 00:03:04,590
+The get_scheduler function
+from the Transformers library
+
+78
+00:03:04,590 --> 00:03:06,150
+is just a convenience function
+
+79
+00:03:06,150 --> 00:03:07,800
+to easily build such a scheduler.
+
+80
+00:03:08,850 --> 00:03:09,683
+You can again use
+
+81
+00:03:09,683 --> 00:03:11,860
+any PyTorch learning
+rate scheduler instead.
+
+82
+00:03:13,110 --> 00:03:14,850
+Finally, if we want our training
+
+83
+00:03:14,850 --> 00:03:17,610
+to take a couple of minutes
+instead of a few hours,
+
+84
+00:03:17,610 --> 00:03:19,530
+we will need to use a GPU.
+
+85
+00:03:19,530 --> 00:03:21,270
+The first step is to get one,
+
+86
+00:03:21,270 --> 00:03:23,283
+for instance by using a Colab notebook.
+
+87
+00:03:24,180 --> 00:03:26,040
+Then you need to actually send your model,
+
+88
+00:03:26,040 --> 00:03:28,923
+and training data on it
+by using a torch device.
+
+89
+00:03:29,790 --> 00:03:30,840
+Double-check the following lines
+
+90
+00:03:30,840 --> 00:03:32,340
+print a CUDA device for you,
+
+91
+00:03:32,340 --> 00:03:35,640
+or be prepared for your training
+to last more than an hour.
+
+92
+00:03:35,640 --> 00:03:37,390
+We can now put everything together.
+
+93
+00:03:38,550 --> 00:03:40,860
+First, we put our model in training mode
+
+94
+00:03:40,860 --> 00:03:42,240
+which will activate the training behavior
+
+95
+00:03:42,240 --> 00:03:44,790
+for some layers, like Dropout.
+
+96
+00:03:44,790 --> 00:03:46,860
+Then go through the number
+of epochs we picked,
+
+97
+00:03:46,860 --> 00:03:50,070
+and all the data in our
+training dataloader.
+
+98
+00:03:50,070 --> 00:03:52,410
+Then we go through all the
+steps we have seen already;
+
+99
+00:03:52,410 --> 00:03:54,240
+send the data to the GPU,
+
+100
+00:03:54,240 --> 00:03:55,560
+compute the model outputs,
+
+101
+00:03:55,560 --> 00:03:57,720
+and in particular the loss.
+
+102
+00:03:57,720 --> 00:03:59,850
+Use the loss to compute gradients,
+
+103
+00:03:59,850 --> 00:04:02,880
+then make a training
+step with the optimizer.
+
+104
+00:04:02,880 --> 00:04:04,500
+Update the learning rate in our scheduler
+
+105
+00:04:04,500 --> 00:04:05,970
+for the next iteration,
+
+106
+00:04:05,970 --> 00:04:07,763
+and zero the gradients of the optimizer.
+
+107
+00:04:09,240 --> 00:04:10,500
+Once this is finished,
+
+108
+00:04:10,500 --> 00:04:12,150
+we can evaluate our model very easily
+
+109
+00:04:12,150 --> 00:04:14,283
+with a metric from the Datasets library.
+
+110
+00:04:15,180 --> 00:04:17,880
+First, we put our model
+in evaluation mode,
+
+111
+00:04:17,880 --> 00:04:20,550
+to deactivate layers like Dropout,
+
+112
+00:04:20,550 --> 00:04:23,850
+then go through all the data
+in the evaluation data loader.
+
+113
+00:04:23,850 --> 00:04:25,530
+As we have seen in the Trainer video,
+
+114
+00:04:25,530 --> 00:04:26,850
+the model outputs logits,
+
+115
+00:04:26,850 --> 00:04:28,530
+and we need to apply the argmax function
+
+116
+00:04:28,530 --> 00:04:30,213
+to convert them into predictions.
+
+117
+00:04:31,350 --> 00:04:33,420
+The metric object then
+has an add_batch method,
+
+118
+00:04:33,420 --> 00:04:36,810
+we can use to send it those
+intermediate predictions.
+
+119
+00:04:36,810 --> 00:04:38,700
+Once the evaluation loop is finished,
+
+120
+00:04:38,700 --> 00:04:40,320
+we just have to call the compute method
+
+121
+00:04:40,320 --> 00:04:42,180
+to get our final results.
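Put together, the loop described in this video looks roughly like the following sketch. It assumes the train_dataloader, model and optimizer from above, plus an eval_dataloader built the same way on the validation split; load_metric was the Datasets-library metric loader at the time of recording (it has since moved to the evaluate library).

    import torch
    from datasets import load_metric
    from transformers import get_scheduler

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)

    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
    )

    # Training loop
    model.train()
    for epoch in range(num_epochs):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            outputs.loss.backward()       # gradients from the loss
            optimizer.step()              # training step
            lr_scheduler.step()           # update the learning rate
            optimizer.zero_grad()         # reset gradients for the next iteration

    # Evaluation loop
    metric = load_metric("glue", "mrpc")
    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)   # logits -> predictions
        metric.add_batch(predictions=predictions, references=batch["labels"])
    metric.compute()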
+ +122 +00:04:42,180 --> 00:04:44,490 +Congratulations, you have +now fine-tuned a model + +123 +00:04:44,490 --> 00:04:45,633 +all by yourself. + +124 +00:04:47,253 --> 00:04:49,920 +(air whooshing) + diff --git a/subtitles/en/30_supercharge-your-pytorch-training-loop-with-accelerate.srt b/subtitles/en/30_supercharge-your-pytorch-training-loop-with-accelerate.srt index 9f8b76dc3..35913784c 100644 --- a/subtitles/en/30_supercharge-your-pytorch-training-loop-with-accelerate.srt +++ b/subtitles/en/30_supercharge-your-pytorch-training-loop-with-accelerate.srt @@ -1,173 +1,322 @@ -1 -00:00:05,360 --> 00:00:08,800 -Supercharge your Pytorch training  -loop with Hugging Face Accelerate.   - -2 -00:00:11,120 --> 00:00:17,040 -There are multiple setups on which you can run  -your training: it could be on CPU, GPUs, TPUs.   - -3 -00:00:17,680 --> 00:00:22,640 -Distributed on one machine with several  -devices, or several machines (often called   - -4 -00:00:22,640 --> 00:00:29,360 -nodes) each with multiple devices. On top of that  -there are new tweaks to make your training faster   - -5 -00:00:29,360 --> 00:00:35,680 -or more memory efficient, like mixed precision  -and DeepSpeed. Each of those setups or training   - -6 -00:00:35,680 --> 00:00:40,080 -tweaks, requires you to change the code of  -your training loop in one way or another   - -7 -00:00:40,080 --> 00:00:46,480 -and to learn a new API. All those setups are  -handled by the Trainer API, and there are several   - -8 -00:00:46,480 --> 00:00:51,440 -third-party libraries that can also help you with  -that. The problem with those is that they can feel   - -9 -00:00:51,440 --> 00:00:56,320 -like a black box and that it might not be easy to  -implement the tweak to the training loop you need.   - -10 -00:00:57,680 --> 00:01:02,000 -Accelerate has been designed specifically to let  -you retain full control over your training loop   - -11 -00:01:02,560 --> 00:01:08,000 -and be as non-intrusive as possible. With  -just four lines to add to your training loop   - -12 -00:01:08,640 --> 00:01:11,840 -(here shown on the code of the training  -loop from the "Raw training loop" video),   - -13 -00:01:12,480 --> 00:01:16,800 -Accelerate will handle all the setups and  -training tweaks mentioned on the first slide.   - -14 -00:01:18,320 --> 00:01:21,600 -It's only one API to learn and  -master instead of ten different ones.   - -15 -00:01:23,120 --> 00:01:27,120 -More specifically, you have to import  -and instantiate an accelerator object,   - -16 -00:01:27,120 --> 00:01:30,000 -that will handle all the necessary  -code for your specific setup.   - -17 -00:01:31,200 --> 00:01:36,880 -Then you have to send it the model, optimizer and  -dataloaders you are using in the prepare method,   - -18 -00:01:37,760 --> 00:01:43,600 -which is the main method to remember. Accelerate  -handles device placement, so you don't need to put   - -19 -00:01:43,600 --> 00:01:49,840 -your batch on the specific device you are using.  -Finally, you have to replace the loss.backward   - -20 -00:01:49,840 --> 00:01:54,880 -line by accelerate.backward(loss),  -and that's all you need!   - -21 -00:01:58,240 --> 00:02:00,480 -Accelerate also handles distributed evaluation.   - -22 -00:02:01,440 --> 00:02:05,280 -You can still use a classic evaluation loop  -such as the one we saw in the "Raw training   - -23 -00:02:05,280 --> 00:02:09,760 -loop" video, in which case all processes  -will each perform the full evaluation.   
- -24 -00:02:11,040 --> 00:02:15,360 -To use a distributed evaluation, you just  -have to adapt your evaluation loop like this:   - -25 -00:02:16,080 --> 00:02:19,920 -pass along the evaluation dataloader  -to the accelerator.prepare method,   - -26 -00:02:19,920 --> 00:02:25,200 -like for training. Then you can dismiss the  -line that places the batch on the proper device,   - -27 -00:02:25,920 --> 00:02:28,800 -and just before passing your  -predictions and labels to your metric,   - -28 -00:02:29,440 --> 00:02:36,160 -use accelerator.gather to gather together  -the predictions and labels from each process.   - -29 -00:02:36,160 --> 00:02:41,440 -A distributed training script has to be launched  -several times on different processes (for instance   - -30 -00:02:41,440 --> 00:02:47,360 -one per GPU you are using). You can use the  -PyTorch tools if you are familiar with them,   - -31 -00:02:48,000 --> 00:02:51,760 -but Accelerate also provides an  -easy API to configure your setup   - -32 -00:02:51,760 --> 00:02:58,000 -and launch your training script. In a terminal,  -run accelerate config and answer the small   - -33 -00:02:58,000 --> 00:03:01,680 -questionnaire to generate a configuration  -file with all the relevant information,   - -34 -00:03:03,120 --> 00:03:07,360 -then you can just run accelerate launch,  -followed by the path to your training script.   - -35 -00:03:08,400 --> 00:03:19,840 -In a notebook, you can use the notebook_launcher  -function to launch your training function. +1 +00:00:00,225 --> 00:00:02,892 +(air whooshing) + +2 +00:00:05,460 --> 00:00:07,470 +- Supercharge your PyTorch training loop + +3 +00:00:07,470 --> 00:00:08,943 +with Hugging Face Accelerate. + +4 +00:00:11,340 --> 00:00:12,600 +There are multiple setups + +5 +00:00:12,600 --> 00:00:14,580 +on which you can run your training: + +6 +00:00:14,580 --> 00:00:17,910 +it could be on CPU, GPUs, TPUs, + +7 +00:00:17,910 --> 00:00:20,610 +distributed on one machine +with several devices, + +8 +00:00:20,610 --> 00:00:23,220 +or even several machines, +often called nodes, + +9 +00:00:23,220 --> 00:00:25,173 +each with multiple devices. + +10 +00:00:26,340 --> 00:00:28,200 +On top of that, there are new tweaks + +11 +00:00:28,200 --> 00:00:30,810 +to make your training +faster or more efficient, + +12 +00:00:30,810 --> 00:00:32,763 +like mixed precision and DeepSpeed. + +13 +00:00:33,840 --> 00:00:36,600 +Each of those setups or training tweaks + +14 +00:00:36,600 --> 00:00:38,760 +requires you to change the +code of your training loop + +15 +00:00:38,760 --> 00:00:41,733 +in one way or another +and to learn a new API. + +16 +00:00:43,260 --> 00:00:45,940 +All those setups are +handled by the Trainer API, + +17 +00:00:45,940 --> 00:00:49,590 +and there are several third-party +libraries that can help. + +18 +00:00:49,590 --> 00:00:50,760 +The problem with those + +19 +00:00:50,760 --> 00:00:53,100 +is that they can feel like a black box + +20 +00:00:53,100 --> 00:00:55,320 +and that it might not be +easy to implement the tweak + +21 +00:00:55,320 --> 00:00:56,820 +to the training loop you need. + +22 +00:00:57,840 --> 00:00:59,760 +Accelerate has been designed specifically + +23 +00:00:59,760 --> 00:01:02,790 +to let you retain full control +over your training loop + +24 +00:01:02,790 --> 00:01:04,833 +and be as non-intrusive as possible. 
+

+25
+00:01:05,760 --> 00:01:08,760
+With just four lines of code
+to add to your training loop,
+
+26
+00:01:08,760 --> 00:01:11,733
+here shown on the example
+of the training loop video,
+
+27
+00:01:12,630 --> 00:01:14,730
+Accelerate will handle all the setups
+
+28
+00:01:14,730 --> 00:01:17,180
+and training tweaks
+mentioned on the first slide.
+
+29
+00:01:18,630 --> 00:01:20,400
+It's only one API to learn and master
+
+30
+00:01:20,400 --> 00:01:21,933
+instead of 10 different ones.
+
+31
+00:01:23,340 --> 00:01:25,980
+More specifically, you have
+to import and instantiate
+
+32
+00:01:25,980 --> 00:01:27,360
+an accelerator object,
+
+33
+00:01:27,360 --> 00:01:29,100
+that will handle all the necessary code
+
+34
+00:01:29,100 --> 00:01:30,300
+for your specific setup.
+
+35
+00:01:31,380 --> 00:01:33,780
+Then you have to send it the model,
+
+36
+00:01:33,780 --> 00:01:36,000
+optimizer and dataloaders you are using
+
+37
+00:01:36,000 --> 00:01:39,633
+in the prepare method, which
+is the main method to remember.
+
+38
+00:01:40,860 --> 00:01:42,870
+Accelerate handles device placement,
+
+39
+00:01:42,870 --> 00:01:44,370
+so you don't need to put your batch
+
+40
+00:01:44,370 --> 00:01:46,980
+on the specific device you are using.
+
+41
+00:01:46,980 --> 00:01:50,640
+Finally, you have to replace
+the loss.backward line
+
+42
+00:01:50,640 --> 00:01:54,300
+by accelerator.backward(loss),
+
+43
+00:01:54,300 --> 00:01:55,500
+and that's all you need!
+
+44
+00:01:58,410 --> 00:02:01,710
+Accelerate also handles
+distributed evaluation.
+
+45
+00:02:01,710 --> 00:02:04,020
+You can still use a
+classic evaluation loop
+
+46
+00:02:04,020 --> 00:02:06,750
+such as the one we saw in
+the training loop video,
+
+47
+00:02:06,750 --> 00:02:08,280
+in which case all processes
+
+48
+00:02:08,280 --> 00:02:10,083
+will each perform the full evaluation.
+
+49
+00:02:11,340 --> 00:02:13,530
+To use a distributed evaluation,
+
+50
+00:02:13,530 --> 00:02:16,380
+you just have to adapt your
+evaluation loop like this:
+
+51
+00:02:16,380 --> 00:02:17,657
+pass along the evaluation dataloader
+
+52
+00:02:17,657 --> 00:02:21,093
+to the accelerator.prepare
+method, like for training.
+
+53
+00:02:22,170 --> 00:02:23,430
+Then you can dismiss the line
+
+54
+00:02:23,430 --> 00:02:26,160
+that places the batch
+on the proper device,
+
+55
+00:02:26,160 --> 00:02:27,870
+and just before passing your predictions
+
+56
+00:02:27,870 --> 00:02:31,110
+and labels to your metric,
+use accelerator.gather
+
+57
+00:02:31,110 --> 00:02:33,300
+to gather together the predictions
+
+58
+00:02:33,300 --> 00:02:34,803
+and labels from each process.
+
+59
+00:02:36,420 --> 00:02:37,890
+A distributed training script
+
+60
+00:02:37,890 --> 00:02:41,040
+has to be launched several
+times on different processes,
+
+61
+00:02:41,040 --> 00:02:43,203
+for instance, one per GPU you are using.
+
+62
+00:02:44,070 --> 00:02:46,350
+You can use the PyTorch tools to do that
+
+63
+00:02:46,350 --> 00:02:48,210
+if you are familiar with them,
+
+64
+00:02:48,210 --> 00:02:50,520
+but Accelerate also provides an easy API
+
+65
+00:02:50,520 --> 00:02:53,523
+to configure your setup and
+launch your training script.
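In code, the additions described above amount roughly to this sketch, adapted from the training loop of the previous video; the model, optimizer, dataloaders and metric objects are assumed to exist already:

    import torch
    from accelerate import Accelerator

    accelerator = Accelerator()
    train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer
    )

    model.train()
    for batch in train_dataloader:
        # No manual batch.to(device): Accelerate handles device placement.
        outputs = model(**batch)
        accelerator.backward(outputs.loss)   # replaces loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Distributed evaluation: gather predictions and labels from every process
    # before sending them to the metric.
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        metric.add_batch(
            predictions=accelerator.gather(predictions),
            references=accelerator.gather(batch["labels"]),
        )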
+ +66 +00:02:54,540 --> 00:02:57,270 +In a terminal, run accelerate config + +67 +00:02:57,270 --> 00:02:58,650 +and answer the small questionnaire + +68 +00:02:58,650 --> 00:03:00,330 +to generate a configuration file + +69 +00:03:00,330 --> 00:03:02,073 +with all the relevant information, + +70 +00:03:03,240 --> 00:03:05,790 +then you can just run accelerate launch, + +71 +00:03:05,790 --> 00:03:08,580 +followed by the path to +your training script. + +72 +00:03:08,580 --> 00:03:12,000 +In a notebook, you can use +the notebook launcher function + +73 +00:03:12,000 --> 00:03:13,233 +to launch your training. + +74 +00:03:15,186 --> 00:03:17,853 +(air whooshing) + diff --git a/subtitles/en/31_navigating-the-model-hub.srt b/subtitles/en/31_navigating-the-model-hub.srt index c8e27a855..8facde57a 100644 --- a/subtitles/en/31_navigating-the-model-hub.srt +++ b/subtitles/en/31_navigating-the-model-hub.srt @@ -1,183 +1,343 @@ -1 -00:00:04,000 --> 00:00:07,760 -In this video, we're going to go over  -the HuggingFace Model Hub navigation.   - -2 -00:00:10,080 --> 00:00:16,160 -This is the huggingface.co landing page. To access  -the model hub, click on the "Models" tab in the   - -3 -00:00:16,160 --> 00:00:22,720 -upper right corner. You should be facing this web  -interface, which can be split into several parts.   - -4 -00:00:24,240 --> 00:00:28,560 -On the left, you'll find categories, which  -you can use to tailor your model search.   - -5 -00:00:29,760 --> 00:00:35,840 -The first category is the "Tasks". Models on  -the hub may be used for a wide variety of tasks.   - -6 -00:00:36,480 --> 00:00:41,440 -These include natural language processing tasks,  -such as question answering or text classification,   - -7 -00:00:41,440 --> 00:00:47,600 -but it isn't only limited to NLP. Other  -tasks from other fields are also available,   - -8 -00:00:47,600 --> 00:00:52,240 -such as image classification for computer vision,  -or automatic speech recognition for speech.   - -9 -00:00:54,720 --> 00:01:00,400 -The second category is the "libraries". Models  -on the hub usually share one of three backbones:   - -10 -00:01:01,040 --> 00:01:07,040 -PyTorch, TensorFlow, or JAX. However, other  -backbones, such as rust or ONNX also exist.   - -11 -00:01:09,440 --> 00:01:14,720 -Finally, this tab can also be used to specify  -from which high-level framework the model comes.   - -12 -00:01:15,920 --> 00:01:20,880 -This includes Transformers, but it isn't  -limited to it. The model Hub is used to host   - -13 -00:01:20,880 --> 00:01:25,840 -a lot of different frameworks' models, and we are  -actively looking to host other frameworks' models.   - -14 -00:01:28,400 --> 00:01:33,440 -The third category is the "Datasets"  -tab. Selecting a dataset from this tab   - -15 -00:01:33,440 --> 00:01:37,360 -means filtering the models so that they  -were trained on that specific dataset.   - -16 -00:01:39,040 --> 00:01:43,600 -The fourth category is the "Languages"  -tab. Selecting a language from this tab   - -17 -00:01:43,600 --> 00:01:46,800 -means filtering the models so that  -they handle the language selected.   - -18 -00:01:48,480 --> 00:01:53,840 -Finally, the last category allows to choose  -the license with which the model is shared.   - -19 -00:01:56,480 --> 00:01:59,440 -On the right, you'll find the  -models available on the model Hub!   - -20 -00:02:00,320 --> 00:02:06,400 -The models are ordered by downloads. When clicking  -on a model, you should be facing its model card.   
- -21 -00:02:07,040 --> 00:02:11,520 -The model card contains information about  -the model: its description, intended use,   - -22 -00:02:11,520 --> 00:02:18,240 -limitations and biases. It can also show code  -snippets on how to use the model, as well as   - -23 -00:02:18,240 --> 00:02:23,840 -any relevant information: training procedure,  -data processing, evaluation results, copyrights.   - -24 -00:02:25,440 --> 00:02:30,160 -This information is crucial for the model to  -be used. The better crafted a model card is,   - -25 -00:02:30,160 --> 00:02:34,000 -the easier it will be for other users to  -leverage your model in their applications.   - -26 -00:02:35,600 --> 00:02:41,440 -On the right of the model card is the inference  -API. This inference API can be used to play with   - -27 -00:02:41,440 --> 00:02:46,640 -the model directly. Feel free to modify the text  -and click on compute to see how would the model   - -28 -00:02:46,640 --> 00:02:55,200 -behave to your inputs. At the top of the screen  -lie the model tags. These include the model task,   - -29 -00:02:55,200 --> 00:02:58,640 -as well as any other tag that is relevant  -to the categories we have just seen.   - -30 -00:03:01,200 --> 00:03:05,920 -The "Files & Versions tab" displays the  -architecture of the repository of that model.   - -31 -00:03:07,120 --> 00:03:12,080 -Here, we can see all the files that define  -this model. You'll see all usual features   - -32 -00:03:12,080 --> 00:03:22,320 -of a git repository: the branches available,  -the commit history as well as the commit diff.   - -33 -00:03:25,600 --> 00:03:28,800 -Three different buttons are available  -at the top of the model card.   - -34 -00:03:29,600 --> 00:03:32,800 -The first one shows how to use the  -inference API programmatically.   - -35 -00:03:35,760 --> 00:03:38,640 -The second one shows how to  -train this model in SageMaker,   - -36 -00:03:42,720 --> 00:03:45,840 -and the last one shows how to load that  -model within the appropriate library.   - -37 -00:03:46,720 --> 00:03:54,480 -For BERT, this is transformers. +1 +00:00:00,468 --> 00:00:03,051 +(upbeat music) + +2 +00:00:04,050 --> 00:00:05,910 +- [Instructor] In this +video, we're going to go over + +3 +00:00:05,910 --> 00:00:08,013 +the HuggingFace Model Hub navigation. + +4 +00:00:10,140 --> 00:00:13,260 +This is the huggingface.co landing page. + +5 +00:00:13,260 --> 00:00:16,020 +To access the model hub, +click on the models tab + +6 +00:00:16,020 --> 00:00:17,463 +in the upper right corner. + +7 +00:00:18,960 --> 00:00:21,030 +You should be facing this web interface, + +8 +00:00:21,030 --> 00:00:23,193 +which can be split into several parts. + +9 +00:00:24,480 --> 00:00:26,790 +On the left, you'll find categories, + +10 +00:00:26,790 --> 00:00:29,090 +which you can use to +tailor your model search. + +11 +00:00:29,970 --> 00:00:32,970 +The first category is the tasks. + +12 +00:00:32,970 --> 00:00:36,660 +Models on the hub may be used +for a wide variety of tasks. + +13 +00:00:36,660 --> 00:00:39,030 +These include natural +language processing tasks, + +14 +00:00:39,030 --> 00:00:41,670 +such as question answering +or text classification, + +15 +00:00:41,670 --> 00:00:43,773 +but it isn't only limited to NLP. + +16 +00:00:44,850 --> 00:00:47,790 +Other tasks from other +fields are also available, + +17 +00:00:47,790 --> 00:00:50,340 +such as image classification +for computer vision, + +18 +00:00:50,340 --> 00:00:52,683 +or automatic speech +recognition for speech. 
+ +19 +00:00:54,840 --> 00:00:57,870 +The second category is the libraries. + +20 +00:00:57,870 --> 00:01:00,990 +Models on the hub usually +share one of three backbones, + +21 +00:01:00,990 --> 00:01:03,900 +PyTorch, TensorFlow, or JAX. + +22 +00:01:03,900 --> 00:01:07,503 +However, other backbones, such +as rust or ONNX also exist. + +23 +00:01:09,540 --> 00:01:11,850 +Finally, this tab can also be used + +24 +00:01:11,850 --> 00:01:15,123 +to specify from which high-level +framework the models comes. + +25 +00:01:16,140 --> 00:01:19,260 +This includes Transformers, +but it isn't limited to it. + +26 +00:01:19,260 --> 00:01:21,060 +The model hub is used to host + +27 +00:01:21,060 --> 00:01:22,920 +a lot of different frameworks models, + +28 +00:01:22,920 --> 00:01:24,600 +and we're actively looking to host + +29 +00:01:24,600 --> 00:01:25,893 +other frameworks models. + +30 +00:01:28,530 --> 00:01:31,890 +The third category is the datasets tab. + +31 +00:01:31,890 --> 00:01:35,070 +Selecting a dataset from this +tab means filtering the models + +32 +00:01:35,070 --> 00:01:37,683 +so that they were trained +on that specific dataset. + +33 +00:01:39,180 --> 00:01:42,300 +The fourth category is the languages tab. + +34 +00:01:42,300 --> 00:01:43,800 +Selecting a language from this tab + +35 +00:01:43,800 --> 00:01:45,990 +means filtering the +models so that they handle + +36 +00:01:45,990 --> 00:01:47,090 +the language selected. + +37 +00:01:48,600 --> 00:01:51,750 +Finally, the last category +allows to choose the license + +38 +00:01:51,750 --> 00:01:53,313 +with which the model is shared. + +39 +00:01:56,700 --> 00:01:58,770 +On the right, you'll +find the models available + +40 +00:01:58,770 --> 00:02:00,480 +on the model hub. + +41 +00:02:00,480 --> 00:02:03,750 +The models are ordered +by downloads by default. + +42 +00:02:03,750 --> 00:02:04,890 +When clicking on a model, + +43 +00:02:04,890 --> 00:02:07,230 +you should be facing its model card. + +44 +00:02:07,230 --> 00:02:09,990 +The model card contains +information about the model, + +45 +00:02:09,990 --> 00:02:13,263 +its description, intended +use, limitations and biases. + +46 +00:02:14,310 --> 00:02:17,580 +It can also show code snippets +on how to use the model, + +47 +00:02:17,580 --> 00:02:20,070 +as well as any relevant information; + +48 +00:02:20,070 --> 00:02:22,080 +training procedure, data processing, + +49 +00:02:22,080 --> 00:02:24,213 +evaluation results or copyrights. + +50 +00:02:25,710 --> 00:02:28,350 +This information is crucial +for the model to be used. + +51 +00:02:28,350 --> 00:02:30,360 +The better crafted a model card is, + +52 +00:02:30,360 --> 00:02:33,270 +the easier it will be for other +users to leverage your model + +53 +00:02:33,270 --> 00:02:34,443 +in their applications. + +54 +00:02:35,820 --> 00:02:38,553 +On the right of the model +card is the inference API. + +55 +00:02:39,540 --> 00:02:41,040 +This inference API can be used + +56 +00:02:41,040 --> 00:02:43,290 +to play with the model directly. + +57 +00:02:43,290 --> 00:02:45,690 +Feel free to modify the +text and click on compute + +58 +00:02:45,690 --> 00:02:48,140 +to see how would the model +behave to your inputs. + +59 +00:02:50,370 --> 00:02:53,013 +At the top of your screen +lies the model tags. + +60 +00:02:53,850 --> 00:02:56,550 +These include the model task, +as well as any other tag + +61 +00:02:56,550 --> 00:02:59,200 +that is relevant to the +categories we have just seen. 
+ +62 +00:03:01,320 --> 00:03:04,410 +The files & versions tab +displays the architecture + +63 +00:03:04,410 --> 00:03:06,213 +of the repository of that model. + +64 +00:03:07,230 --> 00:03:10,920 +Here, we can see all the +files that define this model. + +65 +00:03:10,920 --> 00:03:13,650 +You'll see all usual +features of a Git repository: + +66 +00:03:13,650 --> 00:03:15,093 +the branches available, + +67 +00:03:17,160 --> 00:03:18,520 +the commit history + +68 +00:03:20,760 --> 00:03:22,683 +as well as the commit diff. + +69 +00:03:25,740 --> 00:03:27,510 +Three different buttons are available + +70 +00:03:27,510 --> 00:03:29,760 +at the top of the model card. + +71 +00:03:29,760 --> 00:03:31,170 +The first one shows how to use + +72 +00:03:31,170 --> 00:03:33,093 +the inference API programmatically. + +73 +00:03:35,910 --> 00:03:38,913 +The second one shows how to +train this model in SageMaker. + +74 +00:03:42,870 --> 00:03:44,820 +And the last one shows +how to load that model + +75 +00:03:44,820 --> 00:03:46,860 +within the appropriate library. + +76 +00:03:46,860 --> 00:03:48,783 +For BERT, this is transformers. + +77 +00:03:50,208 --> 00:03:52,791 +(upbeat music) + diff --git a/subtitles/en/32_managing-a-repo-on-the-model-hub.srt b/subtitles/en/32_managing-a-repo-on-the-model-hub.srt index 481275f93..d75814af0 100644 --- a/subtitles/en/32_managing-a-repo-on-the-model-hub.srt +++ b/subtitles/en/32_managing-a-repo-on-the-model-hub.srt @@ -1,346 +1,750 @@ -1 -00:00:02,560 --> 00:00:09,130 -In this video, we're going to understand how -to manage a model repository on the HuggingFace - -2 -00:00:09,130 --> 00:00:10,920 -model hub. - -3 -00:00:10,920 --> 00:00:15,370 -In order to handle a repository, you should -first have a Hugging Face account. - -4 -00:00:15,370 --> 00:00:20,310 -A link to create a new account is available -in the description. - -5 -00:00:20,310 --> 00:00:25,279 -Once you are logged in, you can create a new -repository by clicking on the "New model" - -6 -00:00:25,279 --> 00:00:26,279 -option. - -7 -00:00:26,279 --> 00:00:28,910 -You should be facing a similar modal to the -following. - -8 -00:00:28,910 --> 00:00:34,900 -In the "Owner" input, you can put either your -own namespace or any of your organisations - -9 -00:00:34,900 --> 00:00:36,719 -namespaces. - -10 -00:00:36,719 --> 00:00:41,739 -The model name is the model identifier that -will then be used to identify your model on - -11 -00:00:41,739 --> 00:00:44,250 -your chosen namespace. - -12 -00:00:44,250 --> 00:00:47,450 -The final choice is between public and private. - -13 -00:00:47,450 --> 00:00:50,100 -Public models are accessible by anyone. - -14 -00:00:50,100 --> 00:00:55,030 -This is the recommended, free option, as this -makes your model easily accessible and shareable. - -15 -00:00:55,030 --> 00:01:00,440 -The owners of your namespace are the only -ones who can update and change your model. - -16 -00:01:00,440 --> 00:01:03,210 -A more advanced option is the private option. - -17 -00:01:03,210 --> 00:01:08,460 -In this case, only the owners of your namespace -will have visibility over your model. - -18 -00:01:08,460 --> 00:01:15,010 -Other users won't know it exists and will -not be able to use it. - -19 -00:01:15,010 --> 00:01:18,320 -Let's create a dummy model to play with. - -20 -00:01:18,320 --> 00:01:22,360 -Once your model is created, comes the management -of that model! - -21 -00:01:22,360 --> 00:01:24,170 -Three tabs are available to you. 
- -22 -00:01:24,170 --> 00:01:29,070 -You're facing the first one, which is the -model card page; this is the page used to - -23 -00:01:29,070 --> 00:01:31,170 -showcase your model to the world. - -24 -00:01:31,170 --> 00:01:34,600 -We'll see how it can be completed in a bit. - -25 -00:01:34,600 --> 00:01:38,479 -The second one is the "Files & Versions". - -26 -00:01:38,479 --> 00:01:43,310 -Your model itself is a git repository - if -you're unaware of what is a git repository, - -27 -00:01:43,310 --> 00:01:46,439 -you can think of it as a folder containing -files. - -28 -00:01:46,439 --> 00:01:51,000 -If you have never used git before, we recommend -looking at an introduction like the one provided - -29 -00:01:51,000 --> 00:01:53,960 -in this video's description. - -30 -00:01:53,960 --> 00:01:59,020 -The git repository allows you to see the changes -happening over time in this folder, hence - -31 -00:01:59,020 --> 00:02:00,960 -the term "Versions". - -32 -00:02:00,960 --> 00:02:07,130 -We'll see how to add files and versions in -a bit. - -33 -00:02:07,130 --> 00:02:12,069 -The final tab is the "Settings" tab, which -allow you to manage your model's visibility - -34 -00:02:12,069 --> 00:02:14,780 -and availability. - -35 -00:02:14,780 --> 00:02:18,860 -Let's first start by adding files to the repository. - -36 -00:02:18,860 --> 00:02:23,459 -Files can be added through the web interface -thanks to the "Add File" button. - -37 -00:02:23,459 --> 00:02:28,849 -The added files can be of any type: python, -json, text, you name it! - -38 -00:02:28,849 --> 00:02:35,110 -Alongside your added file and its content, -you should name your change, or commit. - -39 -00:02:35,110 --> 00:02:42,670 -Generally, adding files is simpler when using -the command line. - -40 -00:02:42,670 --> 00:02:47,310 -We'll showcase how to do this using git. - -41 -00:02:47,310 --> 00:02:52,290 -In addition to git, we're using git-lfs, which -stands for large file storage in order to - -42 -00:02:52,290 --> 00:02:53,560 -manage large model files. - -43 -00:02:53,560 --> 00:03:00,980 -First, I make sure that both git and git-lfs -are correctly installed on my system. - -44 -00:03:00,980 --> 00:03:08,280 -Links to install git & git-lfs are provided -in the video description. - -45 -00:03:08,280 --> 00:03:12,470 -Then, we can get to work by cloning the repository -locally. - -46 -00:03:12,470 --> 00:03:14,990 -We have a repository with a single file! - -47 -00:03:14,990 --> 00:03:24,180 -The file that we have just added to the repository -using the web interface. - -48 -00:03:24,180 --> 00:03:45,549 -We can edit it to see the contents of this -file and update these. - -49 -00:03:45,549 --> 00:04:04,439 -It just turns out I have a model handy, that -can be used for sentiment analysis. - -50 -00:04:04,439 --> 00:04:10,790 -I'll simply copy over the contents to this -folder. - -51 -00:04:10,790 --> 00:04:20,030 -This includes the model weights, configuration -file and tokenizer to the repository. - -52 -00:04:20,030 --> 00:04:35,850 -I can then track these two files with the -git add command. - -53 -00:04:35,850 --> 00:04:40,639 -Then, I commit the changes. - -54 -00:04:40,639 --> 00:04:54,640 -I'm giving this commit the title of "Add model -weights and configuration". - -55 -00:04:54,640 --> 00:05:08,910 -Finally, I can push the new commit to the -huggingface.co remote. 
- -56 -00:05:08,910 --> 00:05:39,389 -When going back to the files & versions tab, -we can now see the newly added commit with - -57 -00:05:39,389 --> 00:05:41,090 -the updated files. - -58 -00:05:41,090 --> 00:05:59,250 -We have seen two ways of adding files to a -repository, a third way is explored in the - -59 -00:05:59,250 --> 00:06:07,310 -video about the push to hub API. - -60 -00:06:07,310 --> 00:06:25,099 -A link to this video is in - -61 -00:06:25,099 --> 00:06:45,470 -the description. - -62 -00:06:45,470 --> 00:06:50,229 -Go back to readme. - -63 -00:06:50,229 --> 00:06:57,510 -Unfortunately, the front page of our model -is still very empty. - -64 -00:06:57,510 --> 00:07:01,860 -Let's add a README markdown file to complete -it a little bit. - -65 -00:07:01,860 --> 00:07:06,770 -This README is known as the modelcard, and -it's arguably as important as the model and - -66 -00:07:06,770 --> 00:07:08,770 -tokenizer files in a model repository. - -67 -00:07:08,770 --> 00:07:15,990 -It is the central definition of the model, -ensuring reusability by fellow community members - -68 -00:07:15,990 --> 00:07:21,210 -and reproducibility of results, and providing -a platform on which other members may build - -69 -00:07:21,210 --> 00:07:22,510 -their artifacts. - -70 -00:07:22,510 --> 00:07:27,669 -We'll only add a title and a small description -here for simplicity's sake, but we encourage - -71 -00:07:27,669 --> 00:07:33,000 -you to add information relevant to how was -the model trained, its intended uses and limitations, - -72 -00:07:33,000 --> 00:07:39,190 -as well as its identified and potential biases, -evaluation results and code samples on how - -73 -00:07:39,190 --> 00:07:41,479 -your model should be used. - -74 -00:07:41,479 --> 00:07:44,220 -Great work contributing a model to the model -hub! - -75 -00:07:44,220 --> 00:07:53,110 -This model can now be used in downstream libraries -simply by specifying your model identifier. +1 +00:00:04,200 --> 00:00:06,210 +- [Instructor] In this video, +we're going to understand how + +2 +00:00:06,210 --> 00:00:08,280 +to manage a model repository + +3 +00:00:08,280 --> 00:00:10,053 +on the Hugging Face Hub Model Hub. + +4 +00:00:10,920 --> 00:00:13,020 +In order to handle a repository + +5 +00:00:13,020 --> 00:00:15,450 +you should first have +a Hugging Face account. + +6 +00:00:15,450 --> 00:00:17,610 +A link to create a new +account is available + +7 +00:00:17,610 --> 00:00:18,573 +in the description. + +8 +00:00:20,130 --> 00:00:22,980 +Once you are logged in, you +can create a new repository + +9 +00:00:22,980 --> 00:00:25,890 +by clicking on the new model option. + +10 +00:00:25,890 --> 00:00:29,400 +You should be facing a similar +modal to the following. + +11 +00:00:29,400 --> 00:00:33,240 +In the owner input, you can +put either your own namespace + +12 +00:00:33,240 --> 00:00:35,703 +or any of your organization's namespaces. + +13 +00:00:36,660 --> 00:00:39,330 +The model name is the model identifier + +14 +00:00:39,330 --> 00:00:40,320 +that will then be used + +15 +00:00:40,320 --> 00:00:43,143 +to identify your model +on the chosen namespace. + +16 +00:00:44,130 --> 00:00:47,700 +The final choice is +between public and private. + +17 +00:00:47,700 --> 00:00:49,950 +Public models are accessible by anyone. + +18 +00:00:49,950 --> 00:00:51,840 +This is the recommended free option, + +19 +00:00:51,840 --> 00:00:54,960 +as this makes your model easily +accessible and shareable. 
+ +20 +00:00:54,960 --> 00:00:57,630 +The owners of your +namespace are the only ones + +21 +00:00:57,630 --> 00:00:59,523 +who can update and change your model. + +22 +00:01:00,450 --> 00:01:03,660 +A more advanced option +is the private option. + +23 +00:01:03,660 --> 00:01:04,560 +In this case, + +24 +00:01:04,560 --> 00:01:06,000 +only the owners of your namespace + +25 +00:01:06,000 --> 00:01:08,280 +will have visibility over your model. + +26 +00:01:08,280 --> 00:01:10,260 +Other users won't know it exists + +27 +00:01:10,260 --> 00:01:11,810 +and will not be able to use it. + +28 +00:01:15,030 --> 00:01:17,030 +Let's create a dummy model to play with. + +29 +00:01:18,180 --> 00:01:19,710 +Once your model is created, + +30 +00:01:19,710 --> 00:01:22,230 +comes the management of that model. + +31 +00:01:22,230 --> 00:01:24,360 +Three tabs are available to you. + +32 +00:01:24,360 --> 00:01:27,960 +You're facing the first one, +which is the model card page. + +33 +00:01:27,960 --> 00:01:29,970 +This is the page you use +to showcase your model + +34 +00:01:29,970 --> 00:01:31,110 +to the world. + +35 +00:01:31,110 --> 00:01:33,260 +We'll see how it can +be completed in a bit. + +36 +00:01:34,500 --> 00:01:37,503 +The second one is the +files and versions tab. + +37 +00:01:38,340 --> 00:01:40,920 +Your model itself is a Git repository. + +38 +00:01:40,920 --> 00:01:43,230 +If you're unaware of +what is a Git repository, + +39 +00:01:43,230 --> 00:01:46,320 +you can think of it as a +folder containing files. + +40 +00:01:46,320 --> 00:01:48,120 +If you have never used Git before, + +41 +00:01:48,120 --> 00:01:50,100 +we recommend looking at an introduction + +42 +00:01:50,100 --> 00:01:52,600 +like the one provided in +this video's description. + +43 +00:01:53,850 --> 00:01:56,910 +The Git repository allows you +to see the changes happening + +44 +00:01:56,910 --> 00:02:00,900 +over time in this folder, +hence the term versions. + +45 +00:02:00,900 --> 00:02:03,453 +We'll see how to add files +and versions in a bit. + +46 +00:02:07,020 --> 00:02:09,570 +The final tab is the settings tab, + +47 +00:02:09,570 --> 00:02:12,120 +which allows you to manage +your model's visibility + +48 +00:02:12,120 --> 00:02:13,203 +and availability. + +49 +00:02:14,790 --> 00:02:17,673 +Let's first start by adding +files to the repository. + +50 +00:02:18,540 --> 00:02:19,560 +Files can be added + +51 +00:02:19,560 --> 00:02:23,340 +through the web interface +thanks to the add file button. + +52 +00:02:23,340 --> 00:02:27,060 +The added files can be of +any type, python, JSON, text, + +53 +00:02:27,060 --> 00:02:27,893 +you name it. + +54 +00:02:28,740 --> 00:02:31,170 +Alongside your added file and its content, + +55 +00:02:31,170 --> 00:02:33,363 +you should name your change or commit. + +56 +00:02:36,330 --> 00:02:38,400 +Generally, adding files is simpler + +57 +00:02:38,400 --> 00:02:40,770 +by using the Hugging +Face Hub Python library + +58 +00:02:40,770 --> 00:02:43,050 +or by using the command-line. + +59 +00:02:43,050 --> 00:02:44,310 +We'll showcase how to do this + +60 +00:02:44,310 --> 00:02:46,290 +using the Hugging Face Hub Python library, + +61 +00:02:46,290 --> 00:02:48,060 +and there is a link in the description + +62 +00:02:48,060 --> 00:02:49,800 +to the previous version of this video, + +63 +00:02:49,800 --> 00:02:52,743 +showcasing how to do this +using Git and the command-line. 
+ +64 +00:02:53,610 --> 00:02:54,840 +First, make sure you're logged + +65 +00:02:54,840 --> 00:02:56,460 +into your Hugging Face account, + +66 +00:02:56,460 --> 00:02:59,523 +either through the command-line +or in a Python runtime. + +67 +00:03:04,634 --> 00:03:06,390 +The first approach we'll take a look at + +68 +00:03:06,390 --> 00:03:08,880 +is using the upload file method. + +69 +00:03:08,880 --> 00:03:10,770 +This offers an extremely simple API + +70 +00:03:10,770 --> 00:03:12,630 +to upload files through the hub. + +71 +00:03:12,630 --> 00:03:14,190 +The three required parameters + +72 +00:03:14,190 --> 00:03:16,083 +are the current location of the file, + +73 +00:03:18,570 --> 00:03:21,300 +the path of that file in the repository, + +74 +00:03:21,300 --> 00:03:24,050 +and the idea of the repository +to which you're pushing. + +75 +00:03:25,650 --> 00:03:27,930 +There are a few additional parameters. + +76 +00:03:27,930 --> 00:03:29,100 +The token parameter, + +77 +00:03:29,100 --> 00:03:31,200 +if you would like to +specify a different token + +78 +00:03:31,200 --> 00:03:33,650 +than the one saved in your +cache with your login, + +79 +00:03:34,830 --> 00:03:36,750 +the repo type parameter, + +80 +00:03:36,750 --> 00:03:40,503 +if you would like to push +to a data set or a space. + +81 +00:03:42,300 --> 00:03:45,690 +We'll upload a file called +readme.md to the repository + +82 +00:03:45,690 --> 00:03:47,190 +using this method. + +83 +00:03:47,190 --> 00:03:49,710 +We first start by saving +a file with that name, + +84 +00:03:49,710 --> 00:03:51,210 +which contains some information + +85 +00:03:51,210 --> 00:03:52,920 +about the repository itself. + +86 +00:03:52,920 --> 00:03:54,243 +Here, a title. + +87 +00:03:55,950 --> 00:03:57,420 +Now that the file is saved, + +88 +00:03:57,420 --> 00:04:00,513 +let's use the upload file +method to upload it to the hub. + +89 +00:04:01,560 --> 00:04:03,540 +If we switch to the web +interface for a second + +90 +00:04:03,540 --> 00:04:07,080 +and refresh the page, we'll +see that the README is shown. + +91 +00:04:07,080 --> 00:04:08,883 +The file upload was a success. + +92 +00:04:10,170 --> 00:04:13,500 +Alongside this method +exists a delete file method + +93 +00:04:13,500 --> 00:04:16,170 +so that you may manage +your repository fully. + +94 +00:04:16,170 --> 00:04:18,820 +We'll use it to delete the +file we have just created. + +95 +00:04:22,860 --> 00:04:25,320 +If we refresh the page once again, good, + +96 +00:04:25,320 --> 00:04:26,973 +the file was indeed deleted. + +97 +00:04:29,070 --> 00:04:32,730 +This approach using only these +two methods is super simple. + +98 +00:04:32,730 --> 00:04:35,400 +It doesn't need Git or Git LFS installed, + +99 +00:04:35,400 --> 00:04:37,650 +but it does come with a limitation. + +100 +00:04:37,650 --> 00:04:39,600 +The maximum file size one can upload + +101 +00:04:39,600 --> 00:04:41,313 +is limited to five gigabytes. + +102 +00:04:42,360 --> 00:04:43,890 +To overcome this limit, + +103 +00:04:43,890 --> 00:04:45,540 +let's take a look at the second method + +104 +00:04:45,540 --> 00:04:47,643 +which is the repository utility. + +105 +00:04:48,600 --> 00:04:51,840 +This class is a wrapper over +Git and Git LFS methods, + +106 +00:04:51,840 --> 00:04:53,850 +which abstracts most of the complexity + +107 +00:04:53,850 --> 00:04:55,500 +and offers a flexible API + +108 +00:04:55,500 --> 00:04:57,990 +to manage your online repositories. 
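As a rough sketch, the two methods described above can be used like this; notebook_login, upload_file and delete_file come from the huggingface_hub library, and the repository ID below is a placeholder:

    from huggingface_hub import notebook_login, upload_file, delete_file

    notebook_login()  # or run `huggingface-cli login` in a terminal

    upload_file(
        path_or_fileobj="README.md",         # current location of the file
        path_in_repo="README.md",            # path of the file in the repository
        repo_id="my-username/dummy-model",   # placeholder repository ID
    )

    delete_file(
        path_in_repo="README.md",
        repo_id="my-username/dummy-model",
    )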
+ +109 +00:04:57,990 --> 00:04:59,690 +Let's take a look at how it works. + +110 +00:05:03,870 --> 00:05:08,369 +We first start by instantiating +the repository utility. + +111 +00:05:08,369 --> 00:05:10,380 +We provide the clone from parameter, + +112 +00:05:10,380 --> 00:05:13,383 +in order to clone the +repository we just created. + +113 +00:05:14,400 --> 00:05:18,750 +The repository is now +cloned in the local folder. + +114 +00:05:18,750 --> 00:05:22,200 +The repo object that we +have just initialized + +115 +00:05:22,200 --> 00:05:24,873 +offers quite a few methods +which are useful for us. + +116 +00:05:25,920 --> 00:05:28,800 +We're interested in +pushing a model to the hub. + +117 +00:05:28,800 --> 00:05:31,170 +I'll start by loading +a model and tokenizer + +118 +00:05:31,170 --> 00:05:32,643 +I trained a few hours ago. + +119 +00:05:34,380 --> 00:05:36,810 +We'll now follow the +traditional Git approach + +120 +00:05:36,810 --> 00:05:38,670 +by first pulling latest changes + +121 +00:05:38,670 --> 00:05:40,053 +using the Git pull method. + +122 +00:05:40,980 --> 00:05:43,170 +We just cloned the repository, + +123 +00:05:43,170 --> 00:05:45,780 +so unless this is a +super active repository, + +124 +00:05:45,780 --> 00:05:48,660 +it's unlikely that new +changes are available. + +125 +00:05:48,660 --> 00:05:51,000 +But it's always a good idea +to pull the latest changes + +126 +00:05:51,000 --> 00:05:52,300 +before doing anything new. + +127 +00:05:53,220 --> 00:05:55,200 +Now that we have pulled the repository, + +128 +00:05:55,200 --> 00:05:58,500 +I'll save the model and +tokenizer inside that folder. + +129 +00:05:58,500 --> 00:06:01,200 +This includes the model +weights, configuration file, + +130 +00:06:01,200 --> 00:06:02,673 +and tokenizer files. + +131 +00:06:04,440 --> 00:06:05,820 +Now that the model is saved, + +132 +00:06:05,820 --> 00:06:07,890 +we'll continue with the +traditional Git approach + +133 +00:06:07,890 --> 00:06:10,620 +and push it to the remote repository. + +134 +00:06:10,620 --> 00:06:12,150 +If we were using the command-line, + +135 +00:06:12,150 --> 00:06:14,250 +there are a few Git LFS specific commands + +136 +00:06:14,250 --> 00:06:15,600 +we would have to invoke. + +137 +00:06:15,600 --> 00:06:17,940 +But here, the Hugging Face hub package + +138 +00:06:17,940 --> 00:06:20,070 +takes care of all of that. + +139 +00:06:20,070 --> 00:06:24,420 +We'll start by staging the +files using the Git add method. + +140 +00:06:24,420 --> 00:06:27,600 +We'll then commit these changes +using Git commit method, + +141 +00:06:27,600 --> 00:06:30,690 +and providing a helpful commit message. + +142 +00:06:30,690 --> 00:06:33,210 +Finally, we'll push the +changes to the remote, + +143 +00:06:33,210 --> 00:06:34,953 +using the Git push method. + +144 +00:06:45,090 --> 00:06:47,430 +If we go back to the +files and versions tab, + +145 +00:06:47,430 --> 00:06:49,950 +we can now see the newly committed files. + +146 +00:06:49,950 --> 00:06:52,600 +We can even play with the +model in the inference API. + +147 +00:06:53,790 --> 00:06:55,770 +Unfortunately, the front page of our model + +148 +00:06:55,770 --> 00:06:57,540 +is still very empty. + +149 +00:06:57,540 --> 00:06:59,280 +Let's add a README markdown file + +150 +00:06:59,280 --> 00:07:00,753 +to complete it a little bit. 
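A minimal sketch of the Repository workflow just described, assuming the model and tokenizer are already loaded or trained; the local folder name and repository ID are placeholders:

    from huggingface_hub import Repository

    repo = Repository("dummy-model", clone_from="my-username/dummy-model")
    repo.git_pull()                           # always pull the latest changes first

    model.save_pretrained("dummy-model")      # writes the weights and configuration
    tokenizer.save_pretrained("dummy-model")  # writes the tokenizer files

    repo.git_add()                            # stage everything, including LFS-tracked weights
    repo.git_commit("Add model and tokenizer files")
    repo.git_push()                           # upload the commit to the Hub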
+ +151 +00:07:01,710 --> 00:07:04,200 +This README is known as the model card + +152 +00:07:04,200 --> 00:07:06,030 +and it's arguably as important + +153 +00:07:06,030 --> 00:07:09,330 +as the model and tokenizer +files in the model repository. + +154 +00:07:09,330 --> 00:07:11,280 +It is the central definition + +155 +00:07:11,280 --> 00:07:13,200 +and documentation of your model, + +156 +00:07:13,200 --> 00:07:16,440 +ensuring reusability by +fellow community members + +157 +00:07:16,440 --> 00:07:18,480 +and reproducibility of results. + +158 +00:07:18,480 --> 00:07:20,760 +Providing a platform +on which other members + +159 +00:07:20,760 --> 00:07:22,293 +may build their artifacts. + +160 +00:07:23,220 --> 00:07:25,590 +We'll only add a title and +a small description here + +161 +00:07:25,590 --> 00:07:27,060 +for simplicity's sake, + +162 +00:07:27,060 --> 00:07:29,370 +but we encourage you to +add information relevant + +163 +00:07:29,370 --> 00:07:30,990 +to how was the model trained, + +164 +00:07:30,990 --> 00:07:33,120 +it's intended use and limitations, + +165 +00:07:33,120 --> 00:07:36,180 +as well as it's identified +potential biases, + +166 +00:07:36,180 --> 00:07:37,440 +evaluation results, + +167 +00:07:37,440 --> 00:07:39,843 +and code samples on how to use your model. + +168 +00:07:41,460 --> 00:07:44,130 +Great work contributing +a model to the Model Hub. + +169 +00:07:44,130 --> 00:07:46,440 +This model can now be used +in downstream libraries + +170 +00:07:46,440 --> 00:07:48,783 +simply by specifying +your model identifier. + diff --git a/subtitles/en/33_the-push-to-hub-api-(pytorch).srt b/subtitles/en/33_the-push-to-hub-api-(pytorch).srt index f93d35d17..a2fcf8caf 100644 --- a/subtitles/en/33_the-push-to-hub-api-(pytorch).srt +++ b/subtitles/en/33_the-push-to-hub-api-(pytorch).srt @@ -1,244 +1,479 @@ -1 -00:00:05,130 --> 00:00:06,130 -The Push to Hub API. - -2 -00:00:06,130 --> 00:00:10,310 -Let's have a look at the push_to_hub API. - -3 -00:00:10,310 --> 00:00:16,209 -You will need to be logged in with your Hugging -Face account, which you can do by executing - -4 -00:00:16,209 --> 00:00:22,220 -this first cell or typing huggingface-cli -login in a terminal. - -5 -00:00:22,220 --> 00:00:27,480 -Just enter your username and password and -click login, which will store an authentication - -6 -00:00:27,480 --> 00:00:31,230 -token in the cache of the machine you're using. - -7 -00:00:31,230 --> 00:00:37,990 -Now, let's launch the fine-tuning of a BERT -model on the GLUE COLA dataset. - -8 -00:00:37,990 --> 00:00:41,900 -We won't go over the fine-tuning code because -you can find it in any Transformers tutorial, - -9 -00:00:41,900 --> 00:00:44,350 -or by looking at the videos linked below. - -10 -00:00:44,350 --> 00:00:49,920 -What interests us here, is how we can leverage -the Model Hub during training. - -11 -00:00:49,920 --> 00:00:56,500 -This is done with the push_to_hub=True passed -in your TrainingArguments . This will automatically - -12 -00:00:56,500 --> 00:01:02,149 -upload your model to the Hub each time it -is saved (so every epoch in our case), which - -13 -00:01:02,149 --> 00:01:08,260 -allows you to resume training from a different -machine if the current one gets interrupted. - -14 -00:01:08,260 --> 00:01:13,610 -The model will be uploaded in your namespace, -with the name of the output directory as a - -15 -00:01:13,610 --> 00:01:14,690 -repository name. 
- -16 -00:01:14,690 --> 00:01:20,580 -You can pick another name by passing it to -the hub_model_id argument, and you can also - -17 -00:01:20,580 --> 00:01:32,420 -push inside an organization you are a member -of by passing a full repository name. - -18 -00:01:32,420 --> 00:01:43,290 -With that done, we can just launch training -and wait a little bit. - -19 -00:01:43,290 --> 00:01:47,820 -Note that the model is pushed asynchronously, -meaning that the training continues while - -20 -00:01:47,820 --> 00:01:50,119 -your model is uploaded to the Hub. - -21 -00:01:50,119 --> 00:02:02,399 -When your first commit is finished, you can -go inspect your model on the Hub and even - -22 -00:02:02,399 --> 00:02:11,000 -start playing with its inference widget while -it's training! - -23 -00:02:11,000 --> 00:02:27,370 -There is something wrong with the labels, -but we will fix this later on in this video. - -24 -00:02:27,370 --> 00:02:33,590 -When the training is finished, we should do -one last push with trainer.push_to_hub for - -25 -00:02:33,590 --> 00:02:35,330 -two reasons. - -26 -00:02:35,330 --> 00:02:39,980 -One this will make sure we are uploading the -final version of our models if we didn't already - -27 -00:02:39,980 --> 00:02:45,860 -(for instance if we saved every n steps instead -of every epoch). - -28 -00:02:45,860 --> 00:02:51,310 -Two, this will draft a model card that will -be the landing page of your model repo. - -29 -00:02:51,310 --> 00:03:04,690 -Going back to the model page, you can see -the Trainer included some metadata that is - -30 -00:03:04,690 --> 00:03:15,350 -interpreted by the Hugging Face website in -the model card. - -31 -00:03:15,350 --> 00:03:20,120 -On top of informations about the training, -the intermediate results or the hyperparameter - -32 -00:03:20,120 --> 00:03:26,770 -used, we get the values of the metrics automatically -displayed in a small widget, and a link to - -33 -00:03:26,770 --> 00:03:28,860 -a leaderboard in Paper with Code. - -34 -00:03:28,860 --> 00:03:35,000 -The Tensorboard runs have also been pushed -to this repo, and we can look at them directly - -35 -00:03:35,000 --> 00:03:36,000 -from the Model Hub. - -36 -00:03:36,000 --> 00:03:43,709 -If you were not using the Trainer API to fine-tune -your model, you can use the push_to_hub method - -37 -00:03:43,709 --> 00:03:45,319 -on the model and tokenizer directly. - -38 -00:03:45,319 --> 00:03:49,340 -Let's test this to fix our labels in the inference -widget! - -39 -00:03:49,340 --> 00:03:54,140 -The inference widget was using default names -for labels because we did not indicate the - -40 -00:03:54,140 --> 00:03:57,100 -correspondence between integers and label -names. - -41 -00:03:57,100 --> 00:04:02,909 -We can fix in the configuration by setting -the label2id and id2label fields to their - -42 -00:04:02,909 --> 00:04:07,370 -proper value then we can push the fixed config -to our repo using the push_to_hub method. - -43 -00:04:07,370 --> 00:04:12,220 -Once this is done and we can check on the -website the model is now showing the proper - -44 -00:04:12,220 --> 00:04:13,440 -labels! - -45 -00:04:13,440 --> 00:04:21,280 -Now that the model is on the hub, we can use -it from anywhere with the from_pretrained - -46 -00:04:21,280 --> 00:04:22,370 -method. - -47 -00:04:22,370 --> 00:04:38,880 -We just have to use the identifier from the -hub and we can see that the model configuration - -48 -00:04:38,880 --> 00:04:39,880 -and weights are automatically downloaded. 
-

-49
00:04:39,880 --> 00:04:49,860
We can use this model as we would any other
Transformers model, for instance by loading

-50
00:04:49,860 --> 00:04:53,949
it in a pipeline.

-51
00:04:53,949 --> 00:04:57,550
Try the push_to_hub API on your next training
to easily share your model with the rest of

-52
00:04:57,550 --> 00:05:04,800
the world!
+1
+00:00:00,321 --> 00:00:01,497
+(air whooshing)
+
+2
+00:00:01,497 --> 00:00:02,330
+(smiley face popping)
+
+3
+00:00:02,330 --> 00:00:05,130
+(air whooshing)
+
+4
+00:00:05,130 --> 00:00:06,830
+- [Instructor] So push to hub API.
+
+5
+00:00:08,310 --> 00:00:10,533
+Let's have a look at the push to hub API.
+
+6
+00:00:11,730 --> 00:00:14,640
+You will need to be logged in
+with your Hugging Face account
+
+7
+00:00:14,640 --> 00:00:17,400
+which you can do by
+executing this first cell,
+
+8
+00:00:17,400 --> 00:00:21,123
+or by typing huggingface-cli
+login in a terminal.
+
+9
+00:00:21,990 --> 00:00:26,640
+Just enter your username
+and password, then click login,
+
+10
+00:00:26,640 --> 00:00:28,620
+this will store an authentication token
+
+11
+00:00:28,620 --> 00:00:30,670
+in the cache of the machine you're using.
+
+12
+00:00:31,890 --> 00:00:35,790
+Now, let's launch a fine
+tuning of a BERT model
+
+13
+00:00:35,790 --> 00:00:37,920
+on the GLUE COLA dataset.
+
+14
+00:00:37,920 --> 00:00:39,600
+We won't go over the fine tuning code
+
+15
+00:00:39,600 --> 00:00:42,270
+because you can find it in
+any Transformers tutorial,
+
+16
+00:00:42,270 --> 00:00:44,670
+or by looking at the videos linked below.
+
+17
+00:00:44,670 --> 00:00:46,470
+What interests us here is
+
+18
+00:00:46,470 --> 00:00:48,970
+how we can leverage the
+model hub during training.
+
+19
+00:00:49,860 --> 00:00:52,980
+This is done with the
+"push_to_hub=True" argument
+
+20
+00:00:52,980 --> 00:00:55,530
+passed in your TrainingArguments.
+
+21
+00:00:55,530 --> 00:00:57,240
+This will automatically upload your model
+
+22
+00:00:57,240 --> 00:00:59,400
+to the Hub each time it is saved,
+
+23
+00:00:59,400 --> 00:01:01,323
+so every epoch in our case.
+
+24
+00:01:02,280 --> 00:01:04,860
+This allows you to resume
+training from a different machine
+
+25
+00:01:04,860 --> 00:01:06,873
+if the current one gets interrupted.
+
+26
+00:01:08,220 --> 00:01:10,440
+The model will be uploaded
+in your namespace
+
+27
+00:01:10,440 --> 00:01:14,640
+with the name of the output
+directory you picked by default.
+
+28
+00:01:14,640 --> 00:01:16,020
+You can choose another name
+
+29
+00:01:16,020 --> 00:01:19,113
+by passing it to the
+hub_model_id argument.
+
+30
+00:01:20,070 --> 00:01:23,370
+You can also push inside an
+organization you are a member of
+
+31
+00:01:23,370 --> 00:01:25,740
+by passing a full repository name,
+
+32
+00:01:25,740 --> 00:01:28,933
+with the name of the organization/,
+
+33
+00:01:28,933 --> 00:01:30,433
+the model ID you want to pick.
+
+34
+00:01:32,250 --> 00:01:34,650
+With that done, we can
+just launch training,
+
+35
+00:01:34,650 --> 00:01:36,093
+and wait a little bit.
+
+36
+00:01:36,960 --> 00:01:39,033
+I'll cut the waiting time from the video.
+
+37
+00:01:43,260 --> 00:01:46,350
+Note that the model is
+pushed asynchronously,
+
+38
+00:01:46,350 --> 00:01:47,730
+meaning that the training continues
+
+39
+00:01:47,730 --> 00:01:49,730
+while your model is uploaded to the hub.
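In code, enabling this during training looks roughly like the sketch below; the output directory name, the hub_model_id value and the tokenized dataset are assumptions standing in for the video's notebook:

    from transformers import TrainingArguments, Trainer

    training_args = TrainingArguments(
        "bert-fine-tuned-cola",                 # output directory, also the default repo name
        save_strategy="epoch",
        push_to_hub=True,                       # upload to the Hub every time the model is saved
        hub_model_id="my-org/bert-fine-tuned-cola",  # optional: another name or an organization repo
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],       # assumed from the data-processing videos
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
    )
    trainer.train()   # checkpoints are pushed asynchronously while training continues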
+

+40
+00:01:51,060 --> 00:01:52,950
+When your first commit is finished,
+
+41
+00:01:52,950 --> 00:01:55,650
+you can go inspect your model on the Hub
+
+42
+00:01:55,650 --> 00:01:57,960
+by looking inside your namespace,
+
+43
+00:01:57,960 --> 00:01:59,943
+and you'll find it at the very top.
+
+44
+00:02:01,980 --> 00:02:04,200
+You can even start playing
+with its inference widget
+
+45
+00:02:04,200 --> 00:02:06,630
+while it's continuing the training.
+
+46
+00:02:06,630 --> 00:02:09,270
+The Cola data set tasks
+the model with determining
+
+47
+00:02:09,270 --> 00:02:11,970
+if the sentence is
+grammatically correct or not.
+
+48
+00:02:11,970 --> 00:02:15,510
+So we pick an example of an
+incorrect sentence to test it.
+
+49
+00:02:15,510 --> 00:02:16,950
+Note that it'll take a bit of time
+
+50
+00:02:16,950 --> 00:02:18,750
+to load your model inside
+the inference API,
+
+51
+00:02:18,750 --> 00:02:20,880
+the first time you try to use it.
+
+52
+00:02:20,880 --> 00:02:23,280
+We'll cut that time from the video.
+
+53
+00:02:23,280 --> 00:02:24,870
+There is something wrong with the labels,
+
+54
+00:02:24,870 --> 00:02:27,360
+but we'll fix it later in this video.
+
+55
+00:02:27,360 --> 00:02:29,520
+Once your training is finished,
+
+56
+00:02:29,520 --> 00:02:31,770
+you should do one last
+push with the trainer
+
+57
+00:02:31,770 --> 00:02:33,840
+using the push_to_hub method.
+
+58
+00:02:33,840 --> 00:02:35,430
+This is for two reasons.
+
+59
+00:02:35,430 --> 00:02:36,750
+First, this will make sure
+
+60
+00:02:36,750 --> 00:02:39,180
+you are uploading the
+final version of your model
+
+61
+00:02:39,180 --> 00:02:40,680
+if you didn't already.
+
+62
+00:02:40,680 --> 00:02:42,480
+For instance, if you used the save
+
+63
+00:02:42,480 --> 00:02:46,980
+every n steps strategy
+instead of every epoch.
+
+64
+00:02:46,980 --> 00:02:48,180
+Second, this will draft a model card
+
+65
+00:02:48,180 --> 00:02:51,120
+that will be the landing
+page of your model repo.
+
+66
+00:02:51,120 --> 00:02:52,260
+Once the commit is done,
+
+67
+00:02:52,260 --> 00:02:54,810
+let's go back on our
+model page and refresh.
+
+68
+00:02:54,810 --> 00:02:56,820
+We can see the drafted model card
+
+69
+00:02:56,820 --> 00:02:58,080
+which includes information
+
+70
+00:02:58,080 --> 00:03:00,381
+on which model we fine-tuned,
+
+71
+00:03:00,381 --> 00:03:03,570
+the final evaluation loss and metric,
+
+72
+00:03:03,570 --> 00:03:06,300
+the training hyperparameters used,
+
+73
+00:03:06,300 --> 00:03:08,670
+the intermediate training results,
+
+74
+00:03:08,670 --> 00:03:10,320
+and the framework versions we used
+
+75
+00:03:10,320 --> 00:03:13,173
+so that other people can
+easily reproduce our results.
+
+76
+00:03:15,270 --> 00:03:16,860
+On top of all that information,
+
+77
+00:03:16,860 --> 00:03:19,740
+the trainer also included some
+metadata that is interpreted
+
+78
+00:03:19,740 --> 00:03:22,650
+by the Hugging Face
+website in the model card.
+
+79
+00:03:22,650 --> 00:03:26,010
+You get the value of the metrics
+reported in a nice widget
+
+80
+00:03:26,010 --> 00:03:29,640
+as well as a link to a
+leaderboard in Papers with Code.
+
+81
+00:03:29,640 --> 00:03:32,550
+So the TensorBoard runs
+have also been pushed
+
+82
+00:03:32,550 --> 00:03:34,560
+to this repo, and we can look at them
+
+83
+00:03:34,560 --> 00:03:36,000
+directly from the model hub
+
+84
+00:03:36,000 --> 00:03:38,850
+by clicking on the
+training metrics sub menu.
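The final push described above is a single call; the commit message below is just an example:

    # Uploads the latest version of the model and drafts the model card
    # that becomes the landing page of the repository.
    trainer.push_to_hub(commit_message="End of training")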
+

+85
+00:03:38,850 --> 00:03:39,795
+If you are not using the Trainer API

+86
+00:03:39,795 --> 00:03:42,510
+to fine-tune your model,

+87
+00:03:42,510 --> 00:03:43,770
+you can use the push_to_hub method

+88
+00:03:43,770 --> 00:03:46,427
+on the model and tokenizer directly.

+89
+00:03:46,427 --> 00:03:50,160
+Let's test this to fix the
+labels in the inference widget.

+90
+00:03:50,160 --> 00:03:52,740
+The inference widget was using
+different names for labels

+91
+00:03:52,740 --> 00:03:54,810
+because we did not
+indicate the correspondence

+92
+00:03:54,810 --> 00:03:57,030
+between integer and label names.

+93
+00:03:57,030 --> 00:03:58,740
+We can fix this in the configuration

+94
+00:03:58,740 --> 00:04:01,350
+by setting the label2id

+95
+00:04:01,350 --> 00:04:04,170
+and id2label fields
+to the proper values

+96
+00:04:04,170 --> 00:04:06,933
+when pushing the model config to the hub.

+97
+00:04:07,950 --> 00:04:10,620
+Once this is done, we
+can check on the website,

+98
+00:04:10,620 --> 00:04:13,380
+and the model is now
+showing the proper label.

+99
+00:04:13,380 --> 00:04:15,240
+Now that the model is on the hub,

+100
+00:04:15,240 --> 00:04:17,370
+we can use it from anywhere

+101
+00:04:17,370 --> 00:04:19,920
+as we would any other Transformer model

+102
+00:04:19,920 --> 00:04:21,113
+with the from_pretrained method

+103
+00:04:21,113 --> 00:04:22,923
+or with the pipeline function.

+104
+00:04:34,350 --> 00:04:36,780
+We just have to use the
+identifier from the hub,

+105
+00:04:36,780 --> 00:04:39,450
+and we can see that the model
+configuration and weights

+106
+00:04:39,450 --> 00:04:42,483
+as well as the tokenizer files
+are automatically downloaded.

+107
+00:04:53,880 --> 00:04:55,950
+Try the push_to_hub API
+on your next training

+108
+00:04:55,950 --> 00:04:58,650
+to easily share your model
+with the rest of the world.

+109
+00:05:01,151 --> 00:05:03,818
+(air whooshing)
+

diff --git a/subtitles/en/34_the-push-to-hub-api-(tensorflow).srt b/subtitles/en/34_the-push-to-hub-api-(tensorflow).srt
index 2f4a27605..d0f558ccc 100644
--- a/subtitles/en/34_the-push-to-hub-api-(tensorflow).srt
+++ b/subtitles/en/34_the-push-to-hub-api-(tensorflow).srt
@@ -1,434 +1,877 @@
-1
-00:00:05,040 --> 00:00:12,000
-Hi, this is going to be a video about the
-push_to_hub API for Tensorflow and Keras. So,

-2
-00:00:12,000 --> 00:00:16,240
-to get started, we'll open up our notebook,
-and the first thing you'll need to do is

-3
-00:00:16,240 --> 00:00:22,480
-log in to your HuggingFace account, for example
-with the notebook login function. So to do that,

-4
-00:00:23,040 --> 00:00:28,560
-you simply call the function, the popup will
-emerge, you enter your username and password,

-5
-00:00:28,560 --> 00:00:34,160
-which I'm going to pull out of my password
-manager here, and you're logged in. The next

-6
-00:00:34,160 --> 00:00:38,720
-two cells are just getting everything ready for
-training. So we're just going to load a dataset,

-7
-00:00:38,720 --> 00:00:42,960
-we're going to tokenize that dataset, and then
-we're going to load our model and compile it

-8
-00:00:42,960 --> 00:00:47,040
-with the standard Adam optimizer. So
-I'm just going to run all of those,

-9
-00:00:49,600 --> 00:00:53,760
-we'll wait a few seconds, and
-everything should be ready for training.
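
The label-mapping fix and the pipeline loading described above for the PyTorch flow could look roughly like this, reusing the model from the earlier sketch; the label names and repository identifiers are assumptions:

from transformers import pipeline

# Human-readable label names for CoLA (assumed mapping: 0 = unacceptable, 1 = acceptable).
model.config.id2label = {0: "unacceptable", 1: "acceptable"}
model.config.label2id = {"unacceptable": 0, "acceptable": 1}
model.config.push_to_hub("bert-finetuned-cola")  # only the config is re-uploaded

# Anywhere else, the model can now be loaded by its Hub identifier
# ("your-username/bert-finetuned-cola" is a placeholder).
classifier = pipeline("text-classification", model="your-username/bert-finetuned-cola")
print(classifier("This sentence are wrong."))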
- -10 -00:00:57,600 --> 00:01:03,200 -Okay, so now we're ready to train I'm  -going to show you the two ways you can   - -11 -00:01:03,200 --> 00:01:07,520 -push your model to the Hub. So the  -first is with the PushToHubCallback.   - -12 -00:01:08,080 --> 00:01:14,640 -So a callback in Keras is a function that's called  -regularly during training. You can set it to be   - -13 -00:01:14,640 --> 00:01:20,640 -called after a certain number of steps, or every  -epoch, or even just once at the end of training.   - -14 -00:01:22,480 --> 00:01:27,600 -So a lot of callbacks in Keras, for example,  -control learning rate decaying on plateau   - -15 -00:01:28,320 --> 00:01:34,400 -and things like that. And so this callback, by  -default, will save your model to the Hub once   - -16 -00:01:34,400 --> 00:01:39,200 -every epoch. And that's really helpful especially  -if your training is very long, because that means   - -17 -00:01:39,200 --> 00:01:43,680 -you can resume from that save, so you get this  -automatic cloud-saving of your model, and you can   - -18 -00:01:43,680 --> 00:01:49,760 -even run inference with the checkpoints of your  -model that have been uploaded by this callback,   - -19 -00:01:50,720 --> 00:01:55,120 -and that means you can, y'know, actually  -run some test inputs and actually see how   - -20 -00:01:55,120 --> 00:02:00,560 -your model works at various stages during  -training, which is a really nice feature. So   - -21 -00:02:01,280 --> 00:02:06,560 -we're going to add the PushToHubCallback, and it  -takes just a few arguments. So the first argument   - -22 -00:02:06,560 --> 00:02:11,920 -is the temporary directory that files are going  -to be saved to before they're uploaded to the Hub.   - -23 -00:02:11,920 --> 00:02:16,880 -The second argument is the tokenizer, and the  -third argument here is the keyword argument   - -24 -00:02:17,600 --> 00:02:22,160 -hub_model_id. So that's the name it's going  -to be saved under on the HuggingFace Hub.   - -25 -00:02:23,200 --> 00:02:29,760 -You can also upload to an organization account  -just by adding the organization name before   - -26 -00:02:29,760 --> 00:02:34,320 -the repository name with a slash like this. So  -you probably don't have permissions to upload to   - -27 -00:02:34,320 --> 00:02:38,640 -the Hugging Face organization, if you do please  -file a bug and let us know extremely urgently.   - -28 -00:02:40,640 --> 00:02:44,000 -But if you do have access to your own  -organization then you can use that   - -29 -00:02:44,000 --> 00:02:47,600 -same approach to upload models to their  -account instead of to your own personal   - -30 -00:02:49,280 --> 00:02:56,080 -set of models. So, once you've made your  -callback you simply add it to the callbacks list   - -31 -00:02:56,080 --> 00:03:01,280 -when you're called model.fit() and everything is  -uploaded for you from there, and there's nothing   - -32 -00:03:01,280 --> 00:03:06,320 -else to worry about. The second way to upload a  -model, though, is to call model.push_to_hub().   - -33 -00:03:06,880 --> 00:03:11,920 -So this is more of a once-off method - it's not  -called regularly during training. You can just   - -34 -00:03:11,920 --> 00:03:17,680 -call this manually whenever you want to upload  -a model to the hub. 
So we recommend running this   - -35 -00:03:17,680 --> 00:03:22,720 -after the end of training, just to make sure that  -you have a commit message just to guarantee that   - -36 -00:03:22,720 --> 00:03:27,280 -this was the final version of the model at the  -end of training. And it just makes sure that   - -37 -00:03:28,160 --> 00:03:32,000 -you're working with the definitive end-of-training  -model and not accidentally using a model that's   - -38 -00:03:32,000 --> 00:03:36,720 -from a checkpoint somewhere along the way.  -So I'm going to run both of these cells   - -39 -00:03:38,800 --> 00:03:42,320 -and then I'm going to cut the video here, just  -because training is going to take a couple of   - -40 -00:03:42,320 --> 00:03:46,160 -minutes, and so I'll skip forward to the end of  -that, when the models have all been uploaded,   - -41 -00:03:46,160 --> 00:03:50,880 -and I'm gonna show you how you can access  -the models in the Hub and the other things   - -42 -00:03:50,880 --> 00:03:58,400 -you can do with them from there. Okay,  -we're back and our model was uploaded,   - -43 -00:03:58,960 --> 00:04:03,760 -both by the PushToHubCallback and also by our  -call to model.push_to_hub() after training.   - -44 -00:04:04,720 --> 00:04:10,320 -So everything's looking good! So now if we drop  -over to my profile on HuggingFace, and you can get   - -45 -00:04:10,320 --> 00:04:15,760 -there just by clicking the profile button in the  -dropdown, we can see that the bert-fine-tuned-cola   - -46 -00:04:15,760 --> 00:04:20,560 -model is here, and was updated 3 minutes ago. So  -it'll always be at the top of your list, because   - -47 -00:04:20,560 --> 00:04:25,280 -they're sorted by how recently they were updated.  -And we can start querying our model immediately!   - -48 -00:04:26,640 --> 00:04:36,720 -So the dataset we were training on is the Glue  -CoLA dataset, and CoLA is an acronym for Corpus   - -49 -00:04:36,720 --> 00:04:42,560 -of Linguistic Acceptability. So what that means  -is that the model is being trained to decide if a   - -50 -00:04:42,560 --> 00:04:49,040 -sentence is grammatically or linguistically okay,  -or if there's a problem with it. For example,   - -51 -00:04:49,680 --> 00:04:54,400 -we could say "This is a legitimate sentence"  -and hopefully it realizes that this is in   - -52 -00:04:54,400 --> 00:05:00,880 -fact a legitimate sentence. So it might take a  -couple of seconds for the model to load when you   - -53 -00:05:00,880 --> 00:05:05,200 -call it for the first time, so I might cut  -a couple of seconds out of this video here.   - -54 -00:05:07,680 --> 00:05:14,160 -Okay, we're back! The model loaded and we got  -an output, but there's an obvious problem here.   - -55 -00:05:14,160 --> 00:05:19,680 -So these labels aren't really telling us what  -categories the model has actually assigned to   - -56 -00:05:19,680 --> 00:05:26,720 -this input sentence. So if we want to fix that, we  -want to make sure the model config has the correct   - -57 -00:05:26,720 --> 00:05:31,920 -names for each of the label classes, and then we  -want to upload that config. So we can do that down   - -58 -00:05:31,920 --> 00:05:38,480 -here. To get the label_names, we can get that  -from the dataset we loaded, from the 'features'   - -59 -00:05:38,480 --> 00:05:44,160 -attribute it has. 
And then we can create  -dictionaries "id2label" and "label2id"   - -60 -00:05:45,200 --> 00:05:51,040 -and just assign them to the model config, and then  -we can just push our updated config and that'll   - -61 -00:05:51,040 --> 00:05:58,080 -override the existing config in the Hub repo. So  -that's just been done, so now if we go back here,   - -62 -00:05:58,080 --> 00:06:02,720 -I'm going to use a slightly different sentence  -because the outputs for sentences are sometimes   - -63 -00:06:02,720 --> 00:06:07,600 -cached, and so if we want to generate new results  -I'm going to use something slightly different. So   - -64 -00:06:07,600 --> 00:06:13,840 -let's try an incorrect sentence, so this is not  -valid English grammar and hopefully the model will   - -65 -00:06:13,840 --> 00:06:17,360 -see that. It's going to reload here, so  -I'm going to cut a couple of seconds here,   - -66 -00:06:18,480 --> 00:06:26,400 -and then we'll see what the model is going to say.  -Okay! So the model's confidence isn't very good,   - -67 -00:06:26,400 --> 00:06:31,440 -because of course we didn't really optimize our  -hyperparameters at all, but it has decided that   - -68 -00:06:31,440 --> 00:06:37,200 -this sentence is more likely to be unacceptable  -than acceptable. Presumably if we tried a bit   - -69 -00:06:37,200 --> 00:06:41,280 -harder with training we could get a much lower  -validation loss and therefore the model's   - -70 -00:06:41,280 --> 00:06:47,040 -predictions would be more precise. But let's  -try our original sentence again - of course,   - -71 -00:06:47,040 --> 00:06:52,560 -because of the caching issue we're seeing  -that the original answers are unchanged.   - -72 -00:06:52,560 --> 00:06:58,160 -So let's try a different, valid sentence. So  -let's try "This is a valid English sentence".   - -73 -00:06:59,920 --> 00:07:03,680 -And we see that now the model correctly decides  -that it has a very high probability of being   - -74 -00:07:03,680 --> 00:07:09,840 -acceptable and a very low probability of being  -unacceptable. So you can use this inference API   - -75 -00:07:09,840 --> 00:07:14,320 -even with the checkpoints that are uploaded during  -training, so it can be very interesting to see how   - -76 -00:07:15,200 --> 00:07:19,680 -the model's predictions for sample inputs  -change with each epoch of training.   - -77 -00:07:21,920 --> 00:07:27,040 -Also, the model we've uploaded is going to be  -accessible to you and, if it's shared publicly,   - -78 -00:07:27,040 --> 00:07:32,240 -to anyone else. So if you want to load that  -model all you, or anyone else, needs to do   - -79 -00:07:34,160 --> 00:07:40,640 -is just to load it in either a pipeline  -or you can just load it with, for example,   - -80 -00:07:40,640 --> 00:07:50,960 -TFAutoModelForSequenceClassification and then  -for the name you would just simply pass the path   - -81 -00:07:50,960 --> 00:07:58,560 -to the repo you want to upload - or to download,  -excuse me. So if I want to use this model again,   - -82 -00:07:58,560 --> 00:08:02,880 -if I want to load it from the hub, I just run this  -one line of code, the model will be downloaded   - -83 -00:08:05,280 --> 00:08:11,200 -and with any luck it'll be ready to  -fine-tune on a different dataset,   - -84 -00:08:11,200 --> 00:08:17,760 -make predictions with, or do anything else you  -wanna do. 
So that was a quick overview of how,   - -85 -00:08:17,760 --> 00:08:21,280 -after your training or during your  -training, you can upload models to the Hub,   - -86 -00:08:21,280 --> 00:08:26,800 -you can checkpoint there, you can resume training  -from there, and you can get inference results from   - -87 -00:08:26,800 --> 00:08:37,040 -the models you've uploaded. So thank you,  -and I hope to see you in a future video! +1 +00:00:00,587 --> 00:00:02,670 +(swoosh) + +2 +00:00:05,100 --> 00:00:07,080 +- [Narrator] Hi, this +is going to be a video + +3 +00:00:07,080 --> 00:00:09,420 +about the push_to_hub API + +4 +00:00:09,420 --> 00:00:10,670 +for Tensorflow and Keras. + +5 +00:00:11,820 --> 00:00:14,850 +So, to get started, we'll +open up our notebook. + +6 +00:00:14,850 --> 00:00:16,920 +And the first thing you'll +need to do is log in to + +7 +00:00:16,920 --> 00:00:18,170 +your HuggingFace account, + +8 +00:00:19,043 --> 00:00:20,663 +for example with the +notebook login function. + +9 +00:00:21,570 --> 00:00:24,630 +So to use that, you +simply call the function, + +10 +00:00:24,630 --> 00:00:26,010 +the popup will emerge. + +11 +00:00:26,010 --> 00:00:28,800 +You will enter your username and password, + +12 +00:00:28,800 --> 00:00:31,425 +which I'm going to pull out +of my password manager here, + +13 +00:00:31,425 --> 00:00:33,108 +and you log in. + +14 +00:00:33,108 --> 00:00:35,670 +The next two cells are just + +15 +00:00:35,670 --> 00:00:37,080 +getting everything ready for training. + +16 +00:00:37,080 --> 00:00:38,940 +So we're just going to load a dataset, + +17 +00:00:38,940 --> 00:00:41,100 +we're going to tokenize that dataset, + +18 +00:00:41,100 --> 00:00:42,990 +and then we're going to +load our model and compile + +19 +00:00:42,990 --> 00:00:45,660 +it with the standard Adam optimizer. + +20 +00:00:45,660 --> 00:00:47,560 +So I'm just going to run all of those. + +21 +00:00:49,830 --> 00:00:52,080 +We'll wait a few seconds, + +22 +00:00:52,080 --> 00:00:54,280 +and everything should +be ready for training. + +23 +00:00:57,983 --> 00:00:58,816 +Okay. + +24 +00:00:58,816 --> 00:01:01,440 +So now we're ready to train. + +25 +00:01:01,440 --> 00:01:03,030 +I'm going to show you the two ways + +26 +00:01:03,030 --> 00:01:05,130 +you can push your model to the Hub. + +27 +00:01:05,130 --> 00:01:08,190 +So the first is with +the PushToHubCallback. + +28 +00:01:08,190 --> 00:01:10,107 +So a callback in Keras + +29 +00:01:10,107 --> 00:01:13,710 +is a function that's called +regularly during training. + +30 +00:01:13,710 --> 00:01:17,400 +You can set it to be called +after a certain number of steps, + +31 +00:01:17,400 --> 00:01:21,427 +or every epoch, or even just +once at the end of training. + +32 +00:01:21,427 --> 00:01:25,080 +So a lot of callbacks +in Keras, for example, + +33 +00:01:25,080 --> 00:01:28,050 +control learning rate decaying on plateau, + +34 +00:01:28,050 --> 00:01:30,047 +and things like that. + +35 +00:01:30,047 --> 00:01:32,520 +So this callback, by default, + +36 +00:01:32,520 --> 00:01:35,760 +will save your model to +the Hub once every epoch. + +37 +00:01:35,760 --> 00:01:37,080 +And that's really helpful, + +38 +00:01:37,080 --> 00:01:38,790 +especially if your training is very long, + +39 +00:01:38,790 --> 00:01:40,800 +because that means you +can resume from that save, + +40 +00:01:40,800 --> 00:01:43,290 +so you get this automatic +cloud-saving of your model. 
+ +41 +00:01:43,290 --> 00:01:45,027 +And you can even run inference + +42 +00:01:45,027 --> 00:01:47,730 +with the checkpoints of your model + +43 +00:01:47,730 --> 00:01:50,208 +that have been uploaded by this callback. + +44 +00:01:50,208 --> 00:01:52,260 +And that means you can, + +45 +00:01:52,260 --> 00:01:54,150 +y'know, run some test inputs + +46 +00:01:54,150 --> 00:01:56,100 +and actually see how your model works + +47 +00:01:56,100 --> 00:01:57,990 +at various stages during training, + +48 +00:01:57,990 --> 00:01:59,540 +which is a really nice feature. + +49 +00:02:00,390 --> 00:02:03,960 +So we're going to add +the PushToHubCallback, + +50 +00:02:03,960 --> 00:02:05,670 +and it takes just a few arguments. + +51 +00:02:05,670 --> 00:02:08,250 +So the first argument is +the temporary directory + +52 +00:02:08,250 --> 00:02:10,260 +that files are going to be saved to + +53 +00:02:10,260 --> 00:02:12,150 +before they're uploaded to the Hub. + +54 +00:02:12,150 --> 00:02:14,127 +The second argument is the tokenizer, + +55 +00:02:14,127 --> 00:02:15,808 +and the third argument here + +56 +00:02:15,808 --> 00:02:19,080 +is the keyword argument hub_model_id. + +57 +00:02:19,080 --> 00:02:21,330 +So that's the name it's +going to be saved under + +58 +00:02:21,330 --> 00:02:23,006 +on the HuggingFace Hub. + +59 +00:02:23,006 --> 00:02:26,267 +You can also upload to +an organization account + +60 +00:02:26,267 --> 00:02:29,370 +just by adding the organization name + +61 +00:02:29,370 --> 00:02:32,460 +before the repository name +with a slash, like this. + +62 +00:02:32,460 --> 00:02:34,020 +So you probably don't have permissions + +63 +00:02:34,020 --> 00:02:36,000 +to upload to the HuggingFace organization, + +64 +00:02:36,000 --> 00:02:37,170 +if you do please file a bug + +65 +00:02:37,170 --> 00:02:38,973 +and let us know extremely urgently. + +66 +00:02:40,830 --> 00:02:42,960 +But if you do have access +to your own organization, + +67 +00:02:42,960 --> 00:02:44,730 +then you can use that same approach + +68 +00:02:44,730 --> 00:02:46,650 +to upload models to their account + +69 +00:02:46,650 --> 00:02:50,760 +instead of to your own +personal set of models. + +70 +00:02:50,760 --> 00:02:53,520 +So, once you've made your callback, + +71 +00:02:53,520 --> 00:02:56,310 +you simply add it to the callbacks list + +72 +00:02:56,310 --> 00:02:58,080 +when you're calling model.fit. + +73 +00:02:58,080 --> 00:03:01,110 +And everything is uploaded +for you from there, + +74 +00:03:01,110 --> 00:03:02,610 +there's nothing else to worry about. + +75 +00:03:02,610 --> 00:03:04,530 +The second way to upload a model, though, + +76 +00:03:04,530 --> 00:03:07,020 +is to call model.push_to_hub. + +77 +00:03:07,020 --> 00:03:09,086 +So this is more of a once-off method. + +78 +00:03:09,086 --> 00:03:11,550 +It's not called regularly during training. + +79 +00:03:11,550 --> 00:03:13,680 +You can just call this +manually whenever you want to + +80 +00:03:13,680 --> 00:03:15,240 +upload a model to the hub. + +81 +00:03:15,240 --> 00:03:18,949 +So we recommend running this +after the end of training, + +82 +00:03:18,949 --> 00:03:21,870 +just to make sure that +you have a commit message + +83 +00:03:21,870 --> 00:03:24,060 +to guarantee that this +was the final version + +84 +00:03:24,060 --> 00:03:26,143 +of the model at the end of training. 
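
A rough sketch of the two Keras upload paths just described; the directory and repository names are illustrative, and tokenizer, model, tokenized_data and val_data are assumed to come from the earlier preparation cells:

from transformers.keras_callbacks import PushToHubCallback

# Push a checkpoint to the Hub at the end of every epoch.
callback = PushToHubCallback(
    output_dir="./model_checkpoints",       # temporary local save location
    tokenizer=tokenizer,
    hub_model_id="bert-fine-tuned-cola",    # or "my-org/bert-fine-tuned-cola"
)

model.fit(tokenized_data, validation_data=val_data, epochs=2, callbacks=[callback])

# The once-off alternative: a manual push after training, with a commit message.
model.push_to_hub("bert-fine-tuned-cola", commit_message="End of training")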
+ +85 +00:03:26,143 --> 00:03:27,930 +And it just makes sure that, you know, + +86 +00:03:27,930 --> 00:03:30,480 +you're working with the +definitive end-of-training model + +87 +00:03:30,480 --> 00:03:32,190 +and not accidentally using a checkpoint + +88 +00:03:32,190 --> 00:03:34,224 +from somewhere along the way. + +89 +00:03:34,224 --> 00:03:37,173 +So I'm going to run both of these cells. + +90 +00:03:39,299 --> 00:03:41,716 +And then I'm going to cut the video here, + +91 +00:03:41,716 --> 00:03:43,080 +just because training is going +to take a couple of minutes. + +92 +00:03:43,080 --> 00:03:44,580 +So I'll skip forward to the end of that, + +93 +00:03:44,580 --> 00:03:46,320 +when the models have all been uploaded, + +94 +00:03:46,320 --> 00:03:48,390 +and I'm gonna show you how you can + +95 +00:03:48,390 --> 00:03:50,010 +access the models in the Hub, + +96 +00:03:50,010 --> 00:03:52,713 +and the other things you +can do with them from there. + +97 +00:03:55,440 --> 00:03:56,700 +Okay, we're back, + +98 +00:03:56,700 --> 00:03:59,160 +and our model was uploaded. + +99 +00:03:59,160 --> 00:04:00,750 +Both by the PushToHubCallback + +100 +00:04:00,750 --> 00:04:04,251 +and also by our call to +model.push_to_hub after training. + +101 +00:04:04,251 --> 00:04:05,910 +So everything's looking good. + +102 +00:04:05,910 --> 00:04:09,960 +So now if we drop over to +my profile on HuggingFace, + +103 +00:04:09,960 --> 00:04:12,630 +and you can get there just by +clicking the profile button + +104 +00:04:12,630 --> 00:04:13,680 +in the dropdown. + +105 +00:04:13,680 --> 00:04:16,860 +We can see that the +bert-fine-tuned-cola model is here, + +106 +00:04:16,860 --> 00:04:18,369 +and was updated 3 minutes ago. + +107 +00:04:18,369 --> 00:04:20,520 +So it'll always be at +the top of your list, + +108 +00:04:20,520 --> 00:04:23,340 +because they're sorted by how +recently they were updated. + +109 +00:04:23,340 --> 00:04:25,740 +And we can start querying +our model immediately. + +110 +00:04:30,564 --> 00:04:32,939 +So the dataset we were training on + +111 +00:04:32,939 --> 00:04:34,320 +is the Glue CoLA dataset, + +112 +00:04:34,320 --> 00:04:36,210 +and CoLA is an acronym standing for + +113 +00:04:36,210 --> 00:04:39,420 +the Corpus of Linguistic Acceptability. + +114 +00:04:39,420 --> 00:04:42,480 +So what that means is the model +is being trained to decide + +115 +00:04:42,480 --> 00:04:46,350 +if a sentence is grammatically +or linguistically okay, + +116 +00:04:46,350 --> 00:04:48,171 +or if there's a problem with it. + +117 +00:04:48,171 --> 00:04:52,890 +For example, we could say, +"This is a legitimate sentence." + +118 +00:04:52,890 --> 00:04:54,180 +And hopefully it realizes that + +119 +00:04:54,180 --> 00:04:56,080 +this is in fact a legitimate sentence. + +120 +00:04:57,630 --> 00:05:00,240 +So it might take a couple of +seconds for the model to load + +121 +00:05:00,240 --> 00:05:03,060 +when you call it for the first time. + +122 +00:05:03,060 --> 00:05:05,960 +So I might cut a couple of +seconds out of this video here. + +123 +00:05:07,860 --> 00:05:09,060 +Okay, we're back. + +124 +00:05:09,060 --> 00:05:12,407 +So the model loaded and we got an output, + +125 +00:05:12,407 --> 00:05:14,340 +but there's an obvious problem here. + +126 +00:05:14,340 --> 00:05:16,888 +So these labels aren't really telling us + +127 +00:05:16,888 --> 00:05:19,740 +what categories the model +has actually assigned + +128 +00:05:19,740 --> 00:05:21,655 +to this input sentence. 
+ +129 +00:05:21,655 --> 00:05:23,520 +So if we want to fix that, + +130 +00:05:23,520 --> 00:05:26,010 +we want to make sure the model config + +131 +00:05:26,010 --> 00:05:28,980 +has the correct names for +each of the label classes, + +132 +00:05:28,980 --> 00:05:30,707 +and then we want to upload that config. + +133 +00:05:30,707 --> 00:05:32,220 +So we can do that down here. + +134 +00:05:32,220 --> 00:05:34,050 +To get the label names, + +135 +00:05:34,050 --> 00:05:36,547 +we can get that from +the dataset we loaded, + +136 +00:05:36,547 --> 00:05:39,627 +from the features attribute it has. + +137 +00:05:39,627 --> 00:05:42,217 +And then we can create dictionaries + +138 +00:05:42,217 --> 00:05:44,865 +"id2label" and "label2id", + +139 +00:05:44,865 --> 00:05:47,452 +and just assign them to the model config. + +140 +00:05:47,452 --> 00:05:50,790 +And then we can just +push our updated config, + +141 +00:05:50,790 --> 00:05:54,690 +and that'll override the +existing config in the Hub repo. + +142 +00:05:54,690 --> 00:05:56,368 +So that's just been done. + +143 +00:05:56,368 --> 00:05:58,320 +So now, if we go back here, + +144 +00:05:58,320 --> 00:06:00,000 +I'm going to use a +slightly different sentence + +145 +00:06:00,000 --> 00:06:03,540 +because the outputs for +sentences are sometimes cached. + +146 +00:06:03,540 --> 00:06:06,030 +And so, if we want to generate new results + +147 +00:06:06,030 --> 00:06:07,590 +I'm going to use something +slightly different. + +148 +00:06:07,590 --> 00:06:09,783 +So let's try an incorrect sentence. + +149 +00:06:10,830 --> 00:06:12,640 +So this is not valid English grammar + +150 +00:06:13,538 --> 00:06:15,030 +and hopefully the model will see that. + +151 +00:06:15,030 --> 00:06:16,958 +It's going to reload here, + +152 +00:06:16,958 --> 00:06:18,630 +so I'm going to cut a +couple of seconds here, + +153 +00:06:18,630 --> 00:06:20,933 +and then we'll see what +the model is going to say. + +154 +00:06:22,860 --> 00:06:23,820 +Okay. + +155 +00:06:23,820 --> 00:06:26,580 +So the model, it's +confidence isn't very good, + +156 +00:06:26,580 --> 00:06:28,830 +because of course we +didn't really optimize + +157 +00:06:28,830 --> 00:06:30,630 +our hyperparameters at all. + +158 +00:06:30,630 --> 00:06:32,190 +But it has decided that this sentence + +159 +00:06:32,190 --> 00:06:35,094 +is more likely to be +unacceptable than acceptable. + +160 +00:06:35,094 --> 00:06:38,160 +Presumably if we tried a +bit harder with training + +161 +00:06:38,160 --> 00:06:40,080 +we could get a much lower validation loss, + +162 +00:06:40,080 --> 00:06:43,830 +and therefore the model's +predictions would be more precise. + +163 +00:06:43,830 --> 00:06:46,260 +But let's try our original sentence again. + +164 +00:06:46,260 --> 00:06:49,140 +Of course, because of the caching issue, + +165 +00:06:49,140 --> 00:06:52,740 +we're seeing that the original +answers are unchanged. + +166 +00:06:52,740 --> 00:06:55,196 +So let's try a different, valid sentence. + +167 +00:06:55,196 --> 00:06:58,767 +So let's try, "This is a +valid English sentence". + +168 +00:07:00,150 --> 00:07:02,100 +And we see that now the +model correctly decides + +169 +00:07:02,100 --> 00:07:04,290 +that it has a very high +probability of being acceptable, + +170 +00:07:04,290 --> 00:07:06,900 +and a very low probability +of being unacceptable. 
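
A sketch of the configuration fix just described for the TensorFlow flow; the dataset object and repository name are assumptions:

# Label names come from the dataset's features, as described above;
# "dataset" is assumed to be the loaded GLUE CoLA DatasetDict.
label_names = dataset["train"].features["label"].names
model.config.id2label = {i: name for i, name in enumerate(label_names)}
model.config.label2id = {name: i for i, name in enumerate(label_names)}
model.config.push_to_hub("bert-fine-tuned-cola")  # overrides the config in the Hub repo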
+ +171 +00:07:06,900 --> 00:07:09,930 +So you can use this inference API + +172 +00:07:09,930 --> 00:07:12,810 +even with the checkpoints that +are uploaded during training, + +173 +00:07:12,810 --> 00:07:14,546 +so it can be very interesting to see how + +174 +00:07:14,546 --> 00:07:17,690 +the model's predictions +for sample inputs change + +175 +00:07:17,690 --> 00:07:20,579 +with each epoch of training. + +176 +00:07:20,579 --> 00:07:23,370 +Also, the model we've uploaded + +177 +00:07:23,370 --> 00:07:25,740 +is going to be accessible to you and, + +178 +00:07:25,740 --> 00:07:28,046 +if it's shared publicly, to anyone else. + +179 +00:07:28,046 --> 00:07:29,788 +So if you want to load that model, + +180 +00:07:29,788 --> 00:07:32,500 +all you or anyone else needs to do + +181 +00:07:34,290 --> 00:07:37,440 +is just to load it in either a pipeline, + +182 +00:07:37,440 --> 00:07:40,925 +or you can just load it with, for example, + +183 +00:07:40,925 --> 00:07:43,203 +TFAutoModelForSequenceClassification. + +184 +00:07:46,920 --> 00:07:49,989 +And then for the name you +would just simply pass + +185 +00:07:49,989 --> 00:07:53,325 +the path to the repo you want to upload. + +186 +00:07:53,325 --> 00:07:55,890 +Or to download, excuse me. + +187 +00:07:55,890 --> 00:07:58,710 +So if I want to use this model again, + +188 +00:07:58,710 --> 00:08:00,667 +if I want to load it from the hub, + +189 +00:08:00,667 --> 00:08:01,763 +I just run this one line of code. + +190 +00:08:02,813 --> 00:08:03,773 +The model will be downloaded. + +191 +00:08:07,757 --> 00:08:10,080 +And, with any luck, it'll be ready to + +192 +00:08:10,080 --> 00:08:12,450 +fine-tune on a different +dataset, make predictions with, + +193 +00:08:12,450 --> 00:08:14,340 +or do anything else you wanna do. + +194 +00:08:14,340 --> 00:08:17,700 +So that was a quick overview of how, + +195 +00:08:17,700 --> 00:08:19,470 +after your training or +during your training, + +196 +00:08:19,470 --> 00:08:21,420 +you can upload models to the Hub, + +197 +00:08:21,420 --> 00:08:22,440 +you can checkpoint there, + +198 +00:08:22,440 --> 00:08:24,240 +you can resume training from there, + +199 +00:08:24,240 --> 00:08:26,790 +and you can get inference results + +200 +00:08:26,790 --> 00:08:28,384 +from the models you've uploaded. + +201 +00:08:28,384 --> 00:08:31,084 +So thank you, and I hope to +see you in a future video. + +202 +00:08:32,852 --> 00:08:34,935 +(swoosh) + diff --git a/subtitles/en/35_loading-a-custom-dataset.srt b/subtitles/en/35_loading-a-custom-dataset.srt index 0c31a2132..290b96431 100644 --- a/subtitles/en/35_loading-a-custom-dataset.srt +++ b/subtitles/en/35_loading-a-custom-dataset.srt @@ -1,129 +1,343 @@ -1 -00:00:06,080 --> 00:00:11,600 -Loading a custom dataset. Although the Hugging  -Face Hub hosts over a thousand public datasets,   - -2 -00:00:11,600 --> 00:00:15,040 -you'll often need to work with data that is  -stored on your laptop or some remote server.   - -3 -00:00:15,760 --> 00:00:19,520 -In this video we'll explore how the Datasets  -library can be used to load datasets that   - -4 -00:00:19,520 --> 00:00:24,800 -aren’t available on the Hugging Face Hub.  -As you can see in this table, the Datasets   - -5 -00:00:24,800 --> 00:00:30,080 -library provides several in-built scripts to load  -datasets in several formats. 
To load a dataset in   - -6 -00:00:30,080 --> 00:00:34,160 -one of these formats, you just need to provide the  -name of the format to the load_dataset function,   - -7 -00:00:34,160 --> 00:00:38,000 -along with a data_files argument that  -points to one or more filepaths or URLs.   - -8 -00:00:40,080 --> 00:00:44,400 -To see this in action, let's start by  -loading a local CSV file. In this example,   - -9 -00:00:44,400 --> 00:00:48,720 -we first download a dataset about wine quality  -from the UCI machine learning repository.   - -10 -00:00:50,080 --> 00:00:56,000 -Since this is a CSV file, we then specify the  -csv loading script. This script needs to know   - -11 -00:00:56,000 --> 00:01:00,160 -where our data is located, so we provide the  -filename as part of the data_files argument.   - -12 -00:01:01,920 --> 00:01:05,760 -The CSV loading script also allows you to pass  -several keyword arguments, so here we've also   - -13 -00:01:05,760 --> 00:01:10,640 -specified the separator as a semi-colon. And  -with that we can see the dataset is loaded   - -14 -00:01:10,640 --> 00:01:15,360 -automatically as a DatasetDict object, with each  -column in the CSV file represented as a feature.   - -15 -00:01:17,360 --> 00:01:21,760 -If your dataset is located on some remote  -server like GitHub or some other repository,   - -16 -00:01:21,760 --> 00:01:26,320 -the process is very similar. The only difference  -is that now the data_files argument points to a   - -17 -00:01:26,320 --> 00:01:33,600 -URL instead of a local filepath. Let's now take  -a look at loading raw text files. This format   - -18 -00:01:33,600 --> 00:01:37,840 -is quite common in NLP and you'll typically  -find books and plays are just a single file   - -19 -00:01:37,840 --> 00:01:43,040 -with raw text inside. In this example, we  -have a text file of Shakespeare plays that's   - -20 -00:01:43,040 --> 00:01:48,880 -stored on a GitHub repository. As we did for CSV  -files, we simply choose the text loading script   - -21 -00:01:48,880 --> 00:01:54,080 -and point the data_files argument to the URL.  -As you can see, these files are processed   - -22 -00:01:54,080 --> 00:01:58,640 -line-by-line, so empty lines in the raw text  -are also represented as a row in the dataset.   - -23 -00:02:00,560 --> 00:02:05,840 -For JSON files, there are two main formats to  -know about. The first one is called JSON Lines,   - -24 -00:02:05,840 --> 00:02:10,880 -where every row in the file is a separate JSON  -object. For these files, you can load the dataset   - -25 -00:02:10,880 --> 00:02:15,760 -by selecting the json loading script and pointing  -the data_files argument to the file or URL.   - -26 -00:02:16,960 --> 00:02:21,840 -In this example, we've loaded a JSON lines files  -based on Stack Exchange questions and answers. +1 +00:00:00,195 --> 00:00:01,426 +(screen whooshing) + +2 +00:00:01,426 --> 00:00:02,614 +(sticker popping) + +3 +00:00:02,614 --> 00:00:06,150 +(screen whooshing) + +4 +00:00:06,150 --> 00:00:08,430 +- Loading a custom dataset. + +5 +00:00:08,430 --> 00:00:09,750 +Although the Hugging Face Hub hosts + +6 +00:00:09,750 --> 00:00:11,730 +over a thousand public datasets, + +7 +00:00:11,730 --> 00:00:12,930 +you'll often need to work with data + +8 +00:00:12,930 --> 00:00:15,900 +that is stored on your +laptop or some remote server. 
+ +9 +00:00:15,900 --> 00:00:18,060 +In this video, we'll explore +how the Datasets library + +10 +00:00:18,060 --> 00:00:20,310 +can be used to load datasets +that aren't available + +11 +00:00:20,310 --> 00:00:21,510 +on the Hugging Face Hub. + +12 +00:00:22,980 --> 00:00:25,290 +As you can see in this +table, the Datasets library + +13 +00:00:25,290 --> 00:00:26,700 +provides several in-built scripts + +14 +00:00:26,700 --> 00:00:29,370 +to load datasets in several formats. + +15 +00:00:29,370 --> 00:00:31,200 +To load a dataset in one of these formats, + +16 +00:00:31,200 --> 00:00:32,730 +you just need to provide +the name of the format + +17 +00:00:32,730 --> 00:00:34,350 +to the load_dataset function, + +18 +00:00:34,350 --> 00:00:35,790 +along with a data_files argument + +19 +00:00:35,790 --> 00:00:37,610 +that points to one or +more filepaths or URLs. + +20 +00:00:40,350 --> 00:00:43,590 +To see this in action, let's +start by loading a CSV file. + +21 +00:00:43,590 --> 00:00:45,960 +In this example, we +first download a dataset + +22 +00:00:45,960 --> 00:00:48,963 +about wine quality from the UCI +machine learning repository. + +23 +00:00:50,220 --> 00:00:52,590 +Since this is a CSV file, we then specify + +24 +00:00:52,590 --> 00:00:53,943 +the CSV loading script. + +25 +00:00:55,320 --> 00:00:57,570 +Now, this script needs to know +where our data is located, + +26 +00:00:57,570 --> 00:00:58,650 +so we provide the filename + +27 +00:00:58,650 --> 00:01:00,483 +as part of the data_files argument. + +28 +00:01:01,860 --> 00:01:03,360 +And the loading script also allows you + +29 +00:01:03,360 --> 00:01:05,040 +to pass several keyword arguments, + +30 +00:01:05,040 --> 00:01:06,750 +so here we've also specified + +31 +00:01:06,750 --> 00:01:09,030 +that the separator is a semi-colon. + +32 +00:01:09,030 --> 00:01:10,380 +And with that, we can see the dataset + +33 +00:01:10,380 --> 00:01:13,020 +is loaded automatically +as a DatasetDict object, + +34 +00:01:13,020 --> 00:01:15,920 +with each column in the CSV +file represented as a feature. + +35 +00:01:17,610 --> 00:01:20,280 +If your dataset is located on +some remote server like GitHub + +36 +00:01:20,280 --> 00:01:22,050 +or some other repository, + +37 +00:01:22,050 --> 00:01:23,700 +the process is actually very similar. + +38 +00:01:23,700 --> 00:01:25,980 +The only difference is that +now the data_files argument + +39 +00:01:25,980 --> 00:01:28,623 +points to a URL instead +of a local filepath. + +40 +00:01:30,330 --> 00:01:33,270 +Let's now take a look at +loading raw text files. + +41 +00:01:33,270 --> 00:01:35,100 +This format is quite common in NLP, + +42 +00:01:35,100 --> 00:01:36,750 +and you'll typically find books and plays + +43 +00:01:36,750 --> 00:01:39,393 +are just a single file +with raw text inside. + +44 +00:01:40,410 --> 00:01:43,020 +In this example, we have a +text file of Shakespeare plays + +45 +00:01:43,020 --> 00:01:45,330 +that's stored on a GitHub repository. + +46 +00:01:45,330 --> 00:01:47,040 +And as we did for CSV files, + +47 +00:01:47,040 --> 00:01:49,020 +we simply choose the text loading script + +48 +00:01:49,020 --> 00:01:51,423 +and point the data_files +argument to the URL. + +49 +00:01:52,260 --> 00:01:55,110 +As you can see, these files +are processed line-by-line, + +50 +00:01:55,110 --> 00:01:57,690 +so empty lines in the raw +text are also represented + +51 +00:01:57,690 --> 00:01:58,953 +as a row in the dataset. 
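
As a sketch of the CSV and text loading just described (the local filename and the URL are placeholders):

from datasets import load_dataset

# Local CSV file with a semicolon separator, as in the wine-quality example.
wine_dataset = load_dataset("csv", data_files="winequality-white.csv", sep=";")

# Raw text file processed line by line; the URL stands in for the
# Shakespeare file hosted on GitHub mentioned in the video.
text_dataset = load_dataset("text", data_files="https://example.com/shakespeare.txt")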
+

+52
+00:02:00,810 --> 00:02:04,230
+For JSON files, there are two
+main formats to know about.

+53
+00:02:04,230 --> 00:02:06,060
+The first one is called JSON Lines,

+54
+00:02:06,060 --> 00:02:09,510
+where every row in the file
+is a separate JSON object.

+55
+00:02:09,510 --> 00:02:11,100
+For these files, you can load the dataset

+56
+00:02:11,100 --> 00:02:13,020
+by selecting the JSON loading script

+57
+00:02:13,020 --> 00:02:16,143
+and pointing the data_files
+argument to the file or URL.

+58
+00:02:17,160 --> 00:02:19,410
+In this example, we've
+loaded a JSON Lines file

+59
+00:02:19,410 --> 00:02:21,710
+based on Stack Exchange
+questions and answers.

+60
+00:02:23,490 --> 00:02:26,610
+The other format is nested JSON files.

+61
+00:02:26,610 --> 00:02:29,100
+These files basically look
+like one huge dictionary,

+62
+00:02:29,100 --> 00:02:31,200
+so the load_dataset function
+allows you to specify

+63
+00:02:31,200 --> 00:02:32,733
+which specific field to load.

+64
+00:02:33,630 --> 00:02:35,910
+For example, the SQuAD dataset
+for question answering

+65
+00:02:35,910 --> 00:02:38,340
+has this format, and we
+can load it by specifying

+66
+00:02:38,340 --> 00:02:40,340
+that we're interested in the data field.

+67
+00:02:41,400 --> 00:02:42,780
+There is just one last thing to mention

+68
+00:02:42,780 --> 00:02:44,910
+about all of these loading scripts.

+69
+00:02:44,910 --> 00:02:46,410
+If you have more than one split,

+70
+00:02:46,410 --> 00:02:49,080
+you can load them by treating
+data_files as a dictionary,

+71
+00:02:49,080 --> 00:02:52,140
+and map each split name
+to its corresponding file.

+72
+00:02:52,140 --> 00:02:53,970
+Everything else stays completely unchanged

+73
+00:02:53,970 --> 00:02:55,350
+and you can see an example of loading

+74
+00:02:55,350 --> 00:02:58,283
+both the training and validation
+splits for SQuAD here.

+75
+00:02:59,550 --> 00:03:02,310
+And with that, you can now
+load datasets from your laptop,

+76
+00:03:02,310 --> 00:03:04,653
+the Hugging Face Hub,
+or anywhere else you want.

+77
+00:03:06,277 --> 00:03:09,194
+(screen whooshing)
+

diff --git "a/subtitles/en/36_slice-and-dice-a-dataset-\360\237\224\252.srt" "b/subtitles/en/36_slice-and-dice-a-dataset-\360\237\224\252.srt"
index f899b80e1..dd49935e2 100644
--- "a/subtitles/en/36_slice-and-dice-a-dataset-\360\237\224\252.srt"
+++ "b/subtitles/en/36_slice-and-dice-a-dataset-\360\237\224\252.srt"
@@ -1,203 +1,370 @@
-1
-00:00:05,680 --> 00:00:07,440
-How to slice and dice a dataset.

-2
-00:00:08,640 --> 00:00:12,320
-Most of the time, the data you work with won’t
-be perfectly prepared for training models.

-3
-00:00:13,120 --> 00:00:17,920
-In this video we’ll explore various features
-that Datasets provides to clean up your datasets.

-4
-00:00:19,760 --> 00:00:23,520
-The Datasets library provides several built-in
-methods that allow you to wrangle your data.

-5
-00:00:25,200 --> 00:00:29,360
-In this video we'll see how you can shuffle
-and split your data, select the rows you're

-6
-00:00:29,360 --> 00:00:33,840
-interested in, tweak the columns, and apply
-processing functions with the map() method.

-7
-00:00:35,440 --> 00:00:39,920
-Let's start with shuffling. It is generally a
-good idea to apply shuffling to the training set

-8
-00:00:39,920 --> 00:00:42,640
-so that your model doesn't learn
-any artificial ordering in the data.
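
Following up on the JSON loading and multi-split loading described above for the Datasets library, a minimal sketch (all filenames are placeholders):

from datasets import load_dataset

# JSON Lines: one JSON object per row.
jsonl_dataset = load_dataset("json", data_files="stack_exchange.jsonl")

# Nested JSON such as SQuAD: keep only the "data" field.
squad = load_dataset("json", data_files="squad_train.json", field="data")

# Several splits at once: map each split name to its file.
files = {"train": "squad_train.json", "validation": "squad_validation.json"}
squad_splits = load_dataset("json", data_files=files, field="data")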
- -9 -00:00:43,360 --> 00:00:46,880 -If you want to shuffle the whole dataset, you  -can apply the appropriately named shuffle()   - -10 -00:00:46,880 --> 00:00:51,280 -method to your dataset. You can see an example of  -this method in action here, where we've downloaded   - -11 -00:00:51,280 --> 00:00:56,960 -the training split of the SQUAD dataset  -and shuffled all the rows randomly.Another   - -12 -00:00:56,960 --> 00:01:00,000 -way to shuffle the data is to  -create random train and test splits.   - -13 -00:01:00,720 --> 00:01:05,600 -This can be useful if you have to create your own  -test splits from raw data. To do this, you just   - -14 -00:01:05,600 --> 00:01:11,760 -apply the train_test_split method and specify how  -large the test split should be. In this example,   - -15 -00:01:11,760 --> 00:01:17,280 -we've specified that the test set should be  -10% of the total dataset size. You can see that   - -16 -00:01:17,280 --> 00:01:22,400 -the output of train_test_split is a DatasetDict  -object, whose keys correspond to the new splits.   - -17 -00:01:24,960 --> 00:01:28,400 -Now that we know how to shuffle a dataset,  -let's take a look at returning the rows   - -18 -00:01:28,400 --> 00:01:32,080 -we're interested in. The most common way  -to do this is with the select method.   - -19 -00:01:32,960 --> 00:01:36,560 -This method expects a list or  -generator of the dataset's indices,   - -20 -00:01:36,560 --> 00:01:39,840 -and will then return a new Dataset  -object containing just those rows.   - -21 -00:01:41,280 --> 00:01:45,600 -If you want to create a random sample of rows,  -you can do this by chaining the shuffle and select   - -22 -00:01:45,600 --> 00:01:51,120 -methods together. In this example, we've created  -a sample of 5 elements from the SQuAD dataset.   - -23 -00:01:53,280 --> 00:01:57,360 -The last way to pick out specific rows in  -a dataset is by applying the filter method.   - -24 -00:01:58,080 --> 00:02:01,360 -This method checks whether each  -rows fulfills some condition or not.   - -25 -00:02:02,080 --> 00:02:05,840 -For example, here we've created a small  -lambda function that checks whether the   - -26 -00:02:05,840 --> 00:02:10,800 -title starts with the letter "L". Once we  -apply this function with the filter method,   - -27 -00:02:10,800 --> 00:02:13,840 -we get a subset of the data  -consisting of just these titles.   - -28 -00:02:16,080 --> 00:02:19,360 -So far we've been talking about the rows  -of a dataset, but what about the columns?   - -29 -00:02:20,240 --> 00:02:23,280 -The Datasets library has two main  -methods for transforming columns:   - -30 -00:02:23,840 --> 00:02:26,480 -a rename_column method to  -change the name of a column,   - -31 -00:02:26,480 --> 00:02:31,360 -and a remove_columns method to delete them.  -You can see examples of both these method here.   - -32 -00:02:34,000 --> 00:02:38,400 -Some datasets have nested columns and you can  -expand these by applying the flatten method.   - -33 -00:02:39,120 --> 00:02:44,240 -For example in the SQUAD dataset, the answers  -column contains a text and answer_start field.   - -34 -00:02:44,960 --> 00:02:49,840 -If we want to promote them to their own separate  -columns, we can apply flatten as shown here.   - -35 -00:02:51,280 --> 00:02:55,040 -Of course, no discussion of the Datasets  -library would be complete without mentioning the   - -36 -00:02:55,040 --> 00:03:00,240 -famous map method. This method applies a custom  -processing function to each row in the dataset.   
- -37 -00:03:00,960 --> 00:03:06,480 -For example,here we first define a lowercase_title  -function that simply lowercases the text in the   - -38 -00:03:06,480 --> 00:03:13,760 -title column and then we feed that to the map  -method and voila! we now have lowercase titles.   - -39 -00:03:15,760 --> 00:03:19,280 -The map method can also be used to feed  -batches of rows to the processing function.   - -40 -00:03:19,840 --> 00:03:24,240 -This is especially useful for tokenization,  -where the tokenizers are backed by the Tokenizers   - -41 -00:03:24,240 --> 00:03:31,840 -library can use fast multithreading  -to process batches in parallel. +1 +00:00:00,215 --> 00:00:02,882 +(air whooshing) + +2 +00:00:05,760 --> 00:00:07,623 +- How to slice and dice the dataset? + +3 +00:00:08,760 --> 00:00:10,410 +Most of the time, the data you work with + +4 +00:00:10,410 --> 00:00:13,230 +won't be perfectly prepared +for training models. + +5 +00:00:13,230 --> 00:00:15,810 +In this video, we'll +explore various features + +6 +00:00:15,810 --> 00:00:18,660 +that the datasets library +provides to clean up your data. + +7 +00:00:19,915 --> 00:00:22,500 +The datasets library provides +several built-in methods + +8 +00:00:22,500 --> 00:00:25,350 +that allow you to wrangle +your data in various ways. + +9 +00:00:25,350 --> 00:00:27,360 +In this video, we'll +see how you can shuffle + +10 +00:00:27,360 --> 00:00:30,750 +and split your data, select +the rows you're interested in, + +11 +00:00:30,750 --> 00:00:32,070 +tweak the columns, + +12 +00:00:32,070 --> 00:00:34,620 +and apply processing +functions with the map method. + +13 +00:00:35,640 --> 00:00:37,620 +Let's start with shuffling. + +14 +00:00:37,620 --> 00:00:38,520 +It is generally a good idea + +15 +00:00:38,520 --> 00:00:40,140 +to apply shuffling to your training set + +16 +00:00:40,140 --> 00:00:41,250 +so that your model doesn't learn + +17 +00:00:41,250 --> 00:00:43,590 +any artificial ordering the data. + +18 +00:00:43,590 --> 00:00:45,360 +If you wanna shuffle the whole dataset, + +19 +00:00:45,360 --> 00:00:48,390 +you can apply the appropriately +named shuffle method. + +20 +00:00:48,390 --> 00:00:50,730 +You can see an example of +this method in action here, + +21 +00:00:50,730 --> 00:00:52,200 +where we've downloaded the training split + +22 +00:00:52,200 --> 00:00:55,000 +of the squad dataset and +shuffled all the rows randomly. + +23 +00:00:56,880 --> 00:00:58,230 +Another way to shuffle the data + +24 +00:00:58,230 --> 00:01:00,930 +is to create random train and test splits. + +25 +00:01:00,930 --> 00:01:02,280 +This can be useful if you have to create + +26 +00:01:02,280 --> 00:01:04,620 +your own test splits from raw data. + +27 +00:01:04,620 --> 00:01:07,620 +To do this, you just apply +the train_test_split method + +28 +00:01:07,620 --> 00:01:10,740 +and specify how large +the test split should be. + +29 +00:01:10,740 --> 00:01:14,310 +In this example, we specify +that the test set should be 10% + +30 +00:01:14,310 --> 00:01:15,963 +of the total dataset size. + +31 +00:01:16,890 --> 00:01:19,140 +You can see that the output +of the train_test_split method + +32 +00:01:19,140 --> 00:01:20,610 +is a DatasetDict object + +33 +00:01:20,610 --> 00:01:22,743 +whose keys correspond to the new splits. + +34 +00:01:25,170 --> 00:01:27,210 +Now that we know how +to shuffle the dataset, + +35 +00:01:27,210 --> 00:01:30,060 +let's take a look at returning +the rows we're interested in. 
+ +36 +00:01:30,060 --> 00:01:33,180 +The most common way to do this +is with the select method. + +37 +00:01:33,180 --> 00:01:34,590 +This method expects a list + +38 +00:01:34,590 --> 00:01:36,750 +or a generator of the datasets indices, + +39 +00:01:36,750 --> 00:01:38,670 +and will then return a new dataset object + +40 +00:01:38,670 --> 00:01:40,143 +containing just those rows. + +41 +00:01:41,490 --> 00:01:43,740 +If you wanna create a +random sample of rows, + +42 +00:01:43,740 --> 00:01:45,360 +you can do this by chaining the shuffle + +43 +00:01:45,360 --> 00:01:47,310 +and select methods together. + +44 +00:01:47,310 --> 00:01:48,450 +In this example, + +45 +00:01:48,450 --> 00:01:50,250 +we've created a sample of five elements + +46 +00:01:50,250 --> 00:01:51,423 +from the squad dataset. + +47 +00:01:53,550 --> 00:01:56,010 +The last way to pick out +specific rows in a dataset + +48 +00:01:56,010 --> 00:01:58,290 +is by applying the filter method. + +49 +00:01:58,290 --> 00:02:00,120 +This method checks whether each row + +50 +00:02:00,120 --> 00:02:02,310 +fulfills some condition or not. + +51 +00:02:02,310 --> 00:02:05,130 +For example, here we've +created a small lambda function + +52 +00:02:05,130 --> 00:02:08,460 +that checks whether the title +starts with the letter L. + +53 +00:02:08,460 --> 00:02:11,040 +Once we apply this function +with the filter method, + +54 +00:02:11,040 --> 00:02:14,283 +we get a subset of the data +just containing these rows. + +55 +00:02:16,200 --> 00:02:18,600 +So far, we've been talking +about the rows of a dataset, + +56 +00:02:18,600 --> 00:02:20,490 +but what about the columns? + +57 +00:02:20,490 --> 00:02:22,320 +The datasets library has two main methods + +58 +00:02:22,320 --> 00:02:24,060 +for transforming columns, + +59 +00:02:24,060 --> 00:02:26,760 +a rename_column method to +change the name of the column + +60 +00:02:26,760 --> 00:02:29,460 +and a remove_columns +method to delete them. + +61 +00:02:29,460 --> 00:02:31,860 +You can see examples of +both these methods here. + +62 +00:02:34,140 --> 00:02:36,060 +Some datasets have nested columns, + +63 +00:02:36,060 --> 00:02:39,360 +and you can expand these by +applying the flatten method. + +64 +00:02:39,360 --> 00:02:41,430 +For example, in the squad dataset, + +65 +00:02:41,430 --> 00:02:45,150 +the answers column contains a +text and answer_start field. + +66 +00:02:45,150 --> 00:02:47,430 +If we wanna promote them to +their own separate columns, + +67 +00:02:47,430 --> 00:02:49,383 +we can apply flatten as shown here. + +68 +00:02:51,300 --> 00:02:53,760 +Now of course, no discussion +of the datasets library + +69 +00:02:53,760 --> 00:02:56,880 +would be complete without +mentioning the famous map method. + +70 +00:02:56,880 --> 00:02:59,160 +This method applies a +custom processing function + +71 +00:02:59,160 --> 00:03:01,140 +to each row in the dataset. + +72 +00:03:01,140 --> 00:03:03,360 +For example, here we first define + +73 +00:03:03,360 --> 00:03:04,890 +a lowercase title function, + +74 +00:03:04,890 --> 00:03:07,503 +that simply lowercases the +text in the title column. + +75 +00:03:08,640 --> 00:03:11,700 +And then we feed that +function to the map method, + +76 +00:03:11,700 --> 00:03:14,223 +and voila, we now have lowercase titles. + +77 +00:03:16,020 --> 00:03:18,360 +The map method can also be +used to feed batches of rows + +78 +00:03:18,360 --> 00:03:20,100 +to the processing function. 
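
A compact sketch covering the shuffling, splitting, row selection, column tweaks and map calls described above (the seed and new column names are arbitrary choices):

from datasets import load_dataset

squad = load_dataset("squad", split="train")

# Shuffle, and carve out a random 10% test split.
squad_shuffled = squad.shuffle(seed=42)
splits = squad.train_test_split(test_size=0.1)

# Pick rows by index, by random sample, or by a condition.
sample = squad.shuffle(seed=42).select(range(5))
l_titles = squad.filter(lambda row: row["title"].startswith("L"))

# Tweak the columns.
renamed = squad.rename_column("context", "passage")
trimmed = squad.remove_columns(["id"])
flat = squad.flatten()  # promotes answers.text and answers.answer_start

# Apply a processing function to every row (batched for speed).
def lowercase_title(batch):
    return {"title": [t.lower() for t in batch["title"]]}

lowercased = squad.map(lowercase_title, batched=True)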
+ +79 +00:03:20,100 --> 00:03:22,410 +This is especially useful for tokenization + +80 +00:03:22,410 --> 00:03:25,290 +where the tokenizer is backed +by the Tokenizers library, + +81 +00:03:25,290 --> 00:03:26,910 +and they can use fast multithreading + +82 +00:03:26,910 --> 00:03:28,563 +to process batches in parallel. + +83 +00:03:30,056 --> 00:03:32,723 +(air whooshing) + diff --git "a/subtitles/en/37_datasets-+-dataframes-=-\342\235\244\357\270\217.srt" "b/subtitles/en/37_datasets-+-dataframes-=-\342\235\244\357\270\217.srt" index eb7ab2655..5204eac28 100644 --- "a/subtitles/en/37_datasets-+-dataframes-=-\342\235\244\357\270\217.srt" +++ "b/subtitles/en/37_datasets-+-dataframes-=-\342\235\244\357\270\217.srt" @@ -1,144 +1,283 @@ -1 -00:00:05,200 --> 00:00:11,680 -Datasets and DataFrames equals love. Although the  -processing functions of Datasets will cover most   - -2 -00:00:11,680 --> 00:00:15,600 -the cases needed to train a model, there are  -times when you’ll need to switch to a library   - -3 -00:00:15,600 --> 00:00:21,840 -like Pandas to access more powerful features or  -high-level APIs for visualisation. Fortunately,   - -4 -00:00:21,840 --> 00:00:25,520 -Datasets is designed to be interoperable  -with libraries like Pandas,   - -5 -00:00:25,520 --> 00:00:30,560 -as well as NumPy, PyTorch, TensorFlow,  -and JAX. In this video, we'll take a   - -6 -00:00:30,560 --> 00:00:33,920 -look at how we can quickly switch our  -data to Pandas DataFrames and back.   - -7 -00:00:35,920 --> 00:00:41,280 -As an example, let's suppose we're analysing  -Supreme Court cases from Switzerland. As usual   - -8 -00:00:41,280 --> 00:00:45,440 -we download our dataset from the Hub using the  -load_dataset() function, and you can see that the   - -9 -00:00:45,440 --> 00:00:49,600 -first element of the training set is an ordinary  -Python dictionary with various fields of interest.   - -10 -00:00:51,440 --> 00:00:54,800 -Now suppose that before we train any  -models, we'd like to explore the data a bit.   - -11 -00:00:55,360 --> 00:00:58,720 -For example we might be interested in  -knowing which legal area is most common   - -12 -00:00:59,600 --> 00:01:02,480 -or we might want to know how the  -languages are distributed across regions.   - -13 -00:01:04,320 --> 00:01:07,920 -Answering these questions with the native  -Arrow format isn't easy, but we can easily   - -14 -00:01:07,920 --> 00:01:13,280 -switch to Pandas to get our answers! The way  -this works is by using the set_format() method,   - -15 -00:01:13,280 --> 00:01:17,600 -which will change the output format of the dataset  -from Python dictionaries to Pandas DataFrames.   - -16 -00:01:18,720 --> 00:01:22,720 -As you can see in this example, each row in  -the dataset is represented as a DataFrame,   - -17 -00:01:22,720 --> 00:01:26,160 -so we can slice the whole dataset to  -get a single DataFrame of the dataset.   - -18 -00:01:27,840 --> 00:01:31,040 -The way this works under the hood is  -that the Datasets library changes the   - -19 -00:01:31,040 --> 00:01:35,440 -magic __getitem__() method of the dataset.  -The __getitem__() method is a special method   - -20 -00:01:35,440 --> 00:01:40,320 -for Python containers that allows you to  -specify how indexing works. In this case,   - -21 -00:01:40,320 --> 00:01:44,320 -the __getitem__() method of the raw dataset  -starts off by returning Python dictionaries   - -22 -00:01:45,120 --> 00:01:49,920 -and then after applying set_format() we change  -__getitem__() to return DataFrames instead.   
- -23 -00:01:51,840 --> 00:01:56,240 -The Datasets library also provides a to_pandas()  -method if you want to do the format conversion and   - -24 -00:01:56,240 --> 00:02:02,640 -slicing of the dataset in one go. And once you  -have a DataFrame, you can find answers to all   - -25 -00:02:02,640 --> 00:02:07,840 -sorts of complex questions or make plots with your  -favourite visualisation library and so on. The   - -26 -00:02:07,840 --> 00:02:10,800 -only thing to remember is that once  -you are done with your Pandas analysis,   - -27 -00:02:10,800 --> 00:02:16,240 -you should reset the output format back to Arrow  -tables. If you don't, you can run into problems if   - -28 -00:02:16,240 --> 00:02:20,240 -you try to tokenize your text because it is no  -longer represented as strings in a dictionary.   - -29 -00:02:21,520 --> 00:02:32,160 -By resetting the output format, we get back  -Arrow tables and can tokenize without problem! +1 +00:00:00,227 --> 00:00:01,432 +(whooshing sound) + +2 +00:00:01,432 --> 00:00:02,420 +(sticker popping) + +3 +00:00:02,420 --> 00:00:05,340 +(whooshing sound) + +4 +00:00:05,340 --> 00:00:07,833 +- Datasets and DataFrames equals love. + +5 +00:00:08,790 --> 00:00:11,010 +Although the processing +functions of the Datasets library + +6 +00:00:11,010 --> 00:00:14,040 +will cover most of the cases +needed to train a model, + +7 +00:00:14,040 --> 00:00:15,660 +there are times when you'll +need to switch to a library + +8 +00:00:15,660 --> 00:00:18,240 +like Pandas to access +more powerful features + +9 +00:00:18,240 --> 00:00:20,970 +or high level APIs for visualization. + +10 +00:00:20,970 --> 00:00:23,220 +Fortunately, the Datasets +library is designed + +11 +00:00:23,220 --> 00:00:25,710 +to be interoperable with +libraries like Pandas, + +12 +00:00:25,710 --> 00:00:29,790 +as well as NumPy, PyTorch, +TensorFlow and JAX. + +13 +00:00:29,790 --> 00:00:30,930 +In this video, we'll take a look + +14 +00:00:30,930 --> 00:00:32,550 +at how we can quickly switch our data + +15 +00:00:32,550 --> 00:00:34,263 +to Pandas DataFrames and back. + +16 +00:00:36,120 --> 00:00:38,310 +As an example, let's +suppose we're analyzing + +17 +00:00:38,310 --> 00:00:40,830 +Supreme Court cases from Switzerland. + +18 +00:00:40,830 --> 00:00:43,020 +As usual, we download +our dataset from the hub + +19 +00:00:43,020 --> 00:00:44,940 +using the load_dataset function. + +20 +00:00:44,940 --> 00:00:46,980 +And you can see that the first +element of the training set + +21 +00:00:46,980 --> 00:00:48,510 +is an ordinary Python dictionary + +22 +00:00:48,510 --> 00:00:50,110 +with various fields of interest. + +23 +00:00:51,690 --> 00:00:53,670 +Now, suppose that before +we train any models, + +24 +00:00:53,670 --> 00:00:55,590 +we'd like to explore the data a bit. + +25 +00:00:55,590 --> 00:00:57,390 +For example, we might +be interested in knowing + +26 +00:00:57,390 --> 00:00:59,820 +which legal areas are the most common + +27 +00:00:59,820 --> 00:01:01,380 +or we might wanna know how the languages + +28 +00:01:01,380 --> 00:01:02,930 +are distributed across regions. + +29 +00:01:04,500 --> 00:01:05,333 +Answering these questions + +30 +00:01:05,333 --> 00:01:07,530 +with the native Arrow format isn't easy, + +31 +00:01:07,530 --> 00:01:10,500 +but we can quickly switch to +Pandas to get our answers. 
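
A minimal sketch of the switch to pandas that the next subtitles walk through; the dataset identifier and column name are assumptions standing in for the Swiss court corpus:

from datasets import load_dataset

dataset = load_dataset("swiss_judgment_prediction", "all_languages", split="train")

dataset.set_format("pandas")            # indexing now returns pandas DataFrames
df = dataset[:]                         # one DataFrame for the whole split
print(df["legal area"].value_counts())  # column name is an assumption

# dataset.to_pandas() does the conversion and slicing in one go.

# Once the pandas analysis is done, switch back to Arrow so that
# tokenization keeps working on plain strings.
dataset.reset_format()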
+ +32 +00:01:10,500 --> 00:01:13,500 +The way this works is that by +using the set_format method, + +33 +00:01:13,500 --> 00:01:15,480 +we will change the output +format of the dataset + +34 +00:01:15,480 --> 00:01:18,930 +from Python dictionaries +to Pandas DataFrames. + +35 +00:01:18,930 --> 00:01:20,130 +As you can see in this example, + +36 +00:01:20,130 --> 00:01:22,890 +each row in the dataset is +represented as a DataFrame, + +37 +00:01:22,890 --> 00:01:24,540 +so we can slice the whole dataset + +38 +00:01:24,540 --> 00:01:26,583 +to get a single DataFrame of the corpus. + +39 +00:01:28,080 --> 00:01:29,520 +The way this works under the hood, + +40 +00:01:29,520 --> 00:01:31,080 +is that the datasets library changes + +41 +00:01:31,080 --> 00:01:33,900 +the magic __getitem__ +method of the dataset. + +42 +00:01:33,900 --> 00:01:35,640 +The __getitem__ method is a special method + +43 +00:01:35,640 --> 00:01:37,320 +for Python containers that allows you + +44 +00:01:37,320 --> 00:01:39,870 +to specify how indexing works. + +45 +00:01:39,870 --> 00:01:42,540 +In this case, the __getitem__ +method of the raw dataset + +46 +00:01:42,540 --> 00:01:45,150 +starts off by returning +a Python dictionary + +47 +00:01:45,150 --> 00:01:47,520 +and then after applying set_format, + +48 +00:01:47,520 --> 00:01:50,283 +we change __getitem__ to +return DataFrames instead. + +49 +00:01:52,080 --> 00:01:54,690 +The Datasets library also +provides a to_pandas method + +50 +00:01:54,690 --> 00:01:56,250 +if you wanna do the format conversion + +51 +00:01:56,250 --> 00:01:58,113 +and slicing of the dataset in one go. + +52 +00:02:00,090 --> 00:02:01,590 +And once you have a DataFrame, + +53 +00:02:01,590 --> 00:02:03,990 +you can find the answers to +all sorts of complex questions + +54 +00:02:03,990 --> 00:02:06,740 +or make plots with your +favorite visualization library. + +55 +00:02:07,890 --> 00:02:08,850 +The only thing to remember + +56 +00:02:08,850 --> 00:02:10,830 +is that once you're done +with your Pandas analysis, + +57 +00:02:10,830 --> 00:02:14,460 +you should reset the output +format back to Arrow tables. + +58 +00:02:14,460 --> 00:02:16,350 +If you don't, you can run into problems + +59 +00:02:16,350 --> 00:02:17,910 +if you try to tokenize your text + +60 +00:02:17,910 --> 00:02:19,260 +because it is no longer represented + +61 +00:02:19,260 --> 00:02:20,610 +as strings in a dictionary. + +62 +00:02:21,750 --> 00:02:24,780 +By resetting the output format +we get back Arrow tables + +63 +00:02:24,780 --> 00:02:26,580 +and we can tokenize without problem. + +64 +00:02:27,513 --> 00:02:30,346 +(whooshing sound) + diff --git a/subtitles/en/38_saving-and-reloading-a-dataset.srt b/subtitles/en/38_saving-and-reloading-a-dataset.srt index 79818d595..046a4b4f1 100644 --- a/subtitles/en/38_saving-and-reloading-a-dataset.srt +++ b/subtitles/en/38_saving-and-reloading-a-dataset.srt @@ -1,179 +1,359 @@ -1 -00:00:06,560 --> 00:00:11,600 -Saving and reloading a dataset. In this video  -we'll take a look saving a dataset in various   - -2 -00:00:11,600 --> 00:00:19,200 -formats, and explore the ways to reload the saved  -data. When you download a dataset, the processing   - -3 -00:00:19,200 --> 00:00:23,920 -scripts and data are stored locally on your  -computer. The cache allows the Datasets library   - -4 -00:00:23,920 --> 00:00:29,600 -to avoid re-downloading or processing the entire  -dataset every time you use it. 
The data is stored   - -5 -00:00:29,600 --> 00:00:34,080 -in the form of Arrow tables whose location can  -be found by accessing the dataset's cache_files   - -6 -00:00:34,080 --> 00:00:39,360 -attribute. In this example, we've downloaded  -the allocine dataset from the Hugging Face Hub   - -7 -00:00:39,360 --> 00:00:43,840 -and you can see there are three Arrow files  -stored in the cache, one for each split.   - -8 -00:00:45,120 --> 00:00:48,720 -But in many cases, you'll want to save your  -dataset in a different location or format.   - -9 -00:00:49,600 --> 00:00:53,760 -As shown in the table, the Datasets library  -provides four main functions to achieve this.   - -10 -00:00:54,880 --> 00:00:59,040 -You're probably familiar with the CSV and JSON  -formats, both of which are great if you want   - -11 -00:00:59,040 --> 00:01:04,800 -to save small to medium-sized datasets. But  -if your dataset is huge, you'll want to save   - -12 -00:01:04,800 --> 00:01:09,520 -it in either the Arrow or Parquet formats.  -Arrow files are great if you plan to reload   - -13 -00:01:09,520 --> 00:01:14,080 -or process the data in the near future. Parquet  -files are designed for long-term disk storage   - -14 -00:01:14,080 --> 00:01:17,440 -and are very space efficient. Let's  -take a closer look at each format.   - -15 -00:01:19,520 --> 00:01:25,520 -To save a Dataset or a DatasetDict object in the  -Arrow format we use the save_to_disk function. As   - -16 -00:01:25,520 --> 00:01:30,240 -you can see in this example, we simply provide the  -path we wish to save the data to, and the Datasets   - -17 -00:01:30,240 --> 00:01:34,720 -library will automatically create a directory for  -each split to store the Arrow table and metadata.   - -18 -00:01:35,600 --> 00:01:38,880 -Since we're dealing with a DatasetDict  -object that has multiple splits,   - -19 -00:01:38,880 --> 00:01:41,920 -this information is also stored  -in the dataset_dict.json file.   - -20 -00:01:44,160 --> 00:01:48,000 -Now when we want to reload the Arrow  -datasets, we use the load_from_disk function.   - -21 -00:01:48,640 --> 00:01:53,840 -We simply pass the path of our dataset directory  -and voila the original dataset is recovered!   - -22 -00:01:55,760 --> 00:01:59,920 -If we want to save our datasets in the  -CSV format we use the to_csv function.   - -23 -00:02:00,800 --> 00:02:05,280 -In this case you'll need to loop over the splits  -of the DatasetDict object and save each dataset as   - -24 -00:02:05,280 --> 00:02:11,280 -an individual CSV file. Since the to_csv file  -is based on the one from Pandas, you can pass   - -25 -00:02:11,280 --> 00:02:16,240 -keyword arguments to configure the output. In  -this example, we've set the index argument to   - -26 -00:02:16,240 --> 00:02:23,440 -None to prevent the dataset's index column from  -being included in the CSV files. To reload our CSV   - -27 -00:02:23,440 --> 00:02:29,760 -files, we use the load_dataset function together  -with the csv loading script and data_files   - -28 -00:02:29,760 --> 00:02:35,120 -argument which specifies the filenames associated  -with each split. As you can see in this example,   - -29 -00:02:35,120 --> 00:02:39,280 -by providing all the splits and their filenames,  -we've recovered the original DatasetDict object.   - -30 -00:02:41,840 --> 00:02:45,920 -To save a dataset in the JSON or Parquet  -formats is very similar to the CSV case.   
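As a minimal sketch of the Arrow workflow described above (the output directory name is arbitrary):

from datasets import load_dataset, load_from_disk

raw_datasets = load_dataset("allocine")   # a DatasetDict with one Arrow table per split

# Save every split to disk; one sub-directory per split is created,
# plus a dataset_dict.json file describing the splits
raw_datasets.save_to_disk("allocine-arrow")

# Later on, reload the exact same DatasetDict from that directory
reloaded = load_from_disk("allocine-arrow")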
- -31 -00:02:46,480 --> 00:02:52,720 -We use either the to_json function for JSON files  -or the to_parquet function for Parquet ones. And   - -32 -00:02:52,720 --> 00:02:57,440 -just like the CSV case, we need to loop over the  -splits and save each one as an individual file.   - -33 -00:02:59,680 --> 00:03:03,760 -Once our datasets are saved as JSON or  -Parquet files, we can reload them again   - -34 -00:03:03,760 --> 00:03:09,680 -with the appropriate script in the load_dataset  -function, and a data_files argument as before.   - -35 -00:03:10,640 --> 00:03:14,160 -This example shows how we can reload  -our saved datasets in either format.   - -36 -00:03:16,400 --> 00:03:26,000 -And with that you now know how to  -save your datasets in various formats! +1 +00:00:00,000 --> 00:00:02,917 +(transition music) + +2 +00:00:06,600 --> 00:00:08,283 +- Saving and reloading a dataset. + +3 +00:00:09,210 --> 00:00:10,320 +In this video, we'll take a look + +4 +00:00:10,320 --> 00:00:12,360 +at saving a dataset in various formats + +5 +00:00:12,360 --> 00:00:14,660 +and explore the ways to +reload the saved data. + +6 +00:00:17,310 --> 00:00:20,100 +When you download a dataset, +the processing scripts and data + +7 +00:00:20,100 --> 00:00:22,470 +are stored locally on your computer. + +8 +00:00:22,470 --> 00:00:24,000 +The cache allows the Datasets library + +9 +00:00:24,000 --> 00:00:25,230 +to avoid re-downloading + +10 +00:00:25,230 --> 00:00:28,620 +or processing the entire +dataset every time you use it. + +11 +00:00:28,620 --> 00:00:31,170 +Now, the data is stored in +the form of Arrow tables + +12 +00:00:31,170 --> 00:00:32,490 +whose location can be found + +13 +00:00:32,490 --> 00:00:35,730 +by accessing the dataset's +cache_files attribute. + +14 +00:00:35,730 --> 00:00:38,430 +In this example, we've +downloaded the allocine dataset + +15 +00:00:38,430 --> 00:00:40,080 +from the Hugging Face Hub, and you can see + +16 +00:00:40,080 --> 00:00:41,430 +that there are three Arrow files + +17 +00:00:41,430 --> 00:00:43,473 +stored in the cache, one for each split. + +18 +00:00:45,360 --> 00:00:47,460 +But in many cases, you'll +wanna save your dataset + +19 +00:00:47,460 --> 00:00:49,890 +in a different location or format. + +20 +00:00:49,890 --> 00:00:51,900 +As shown in the table, +the Datasets library + +21 +00:00:51,900 --> 00:00:54,870 +provides four main +functions to achieve this. + +22 +00:00:54,870 --> 00:00:56,130 +Now, you're probably already familiar + +23 +00:00:56,130 --> 00:00:58,770 +with the CSV and JSON formats, +both of which are great + +24 +00:00:58,770 --> 00:01:00,810 +if you just wanna quickly save a small + +25 +00:01:00,810 --> 00:01:02,790 +or medium-sized dataset. + +26 +00:01:02,790 --> 00:01:03,976 +But if your dataset is huge, + +27 +00:01:03,976 --> 00:01:07,860 +you'll wanna save it in either +the Arrow or Parquet formats. + +28 +00:01:07,860 --> 00:01:09,660 +Arrow files are great +if you plan to reload + +29 +00:01:09,660 --> 00:01:11,850 +or process the data in the near future. + +30 +00:01:11,850 --> 00:01:13,290 +While Parquet files are designed + +31 +00:01:13,290 --> 00:01:16,140 +for long-term storage and +are very space-efficient. + +32 +00:01:16,140 --> 00:01:18,140 +Let's take a closer look at each format. + +33 +00:01:19,800 --> 00:01:21,750 +To save a dataset or a dataset_dict object + +34 +00:01:21,750 --> 00:01:25,560 +in the Arrow format, we use +the save_to_disk function. 
+ +35 +00:01:25,560 --> 00:01:26,910 +As you can see in this example, + +36 +00:01:26,910 --> 00:01:29,790 +we simply provide the path +we wish to save the data to + +37 +00:01:29,790 --> 00:01:30,720 +and the Datasets library + +38 +00:01:30,720 --> 00:01:32,340 +will automatically create a directory + +39 +00:01:32,340 --> 00:01:35,790 +for each split to store the +Arrow table and the metadata. + +40 +00:01:35,790 --> 00:01:37,680 +Since we're dealing with +a dataset_dict object + +41 +00:01:37,680 --> 00:01:39,090 +that has multiple splits, + +42 +00:01:39,090 --> 00:01:40,590 +this information is also stored + +43 +00:01:40,590 --> 00:01:42,243 +in the dataset_dict.json file. + +44 +00:01:44,250 --> 00:01:46,710 +Now, when we wanna reload +the Arrow datasets, + +45 +00:01:46,710 --> 00:01:48,870 +we use the load_from_disk function. + +46 +00:01:48,870 --> 00:01:51,210 +We simply pass the path +of our dataset directory, + +47 +00:01:51,210 --> 00:01:53,583 +and voila, the original +dataset is recovered. + +48 +00:01:55,594 --> 00:01:57,180 +If we wanna save our dataset + +49 +00:01:57,180 --> 00:02:00,990 +in the CSV format, we +use the to_csv function. + +50 +00:02:00,990 --> 00:02:02,280 +In this case, you'll need to loop + +51 +00:02:02,280 --> 00:02:04,170 +over the splits of the dataset_dict object + +52 +00:02:04,170 --> 00:02:07,710 +and save each dataset as +an individual CSV file. + +53 +00:02:07,710 --> 00:02:10,950 +Since the to_csv function is +based on the one from Pandas, + +54 +00:02:10,950 --> 00:02:13,980 +you can pass keyword arguments +to configure the output. + +55 +00:02:13,980 --> 00:02:16,230 +In this example, we've +set the index argument + +56 +00:02:16,230 --> 00:02:18,480 +to None to prevent the +dataset's index column + +57 +00:02:18,480 --> 00:02:20,553 +from being included in the CSV files. + +58 +00:02:22,470 --> 00:02:24,240 +To reload our CSV files, + +59 +00:02:24,240 --> 00:02:27,180 +we just then use the familiar +load_dataset function + +60 +00:02:27,180 --> 00:02:29,160 +together with the CSV loading script + +61 +00:02:29,160 --> 00:02:30,360 +and the data_files argument, + +62 +00:02:30,360 --> 00:02:34,020 +which specifies the file names +associated with each split. + +63 +00:02:34,020 --> 00:02:35,400 +As you can see in this example, + +64 +00:02:35,400 --> 00:02:37,320 +by providing all the splits +and their file names, + +65 +00:02:37,320 --> 00:02:39,770 +we've recovered the original +dataset_dict object. + +66 +00:02:41,880 --> 00:02:43,560 +Now, to save a dataset in the JSON + +67 +00:02:43,560 --> 00:02:46,710 +or Parquet formats is very +similar to the CSV case. + +68 +00:02:46,710 --> 00:02:49,890 +We use either the to_json +function for JSON files + +69 +00:02:49,890 --> 00:02:52,740 +or the to_parquet +function for Parquet ones. + +70 +00:02:52,740 --> 00:02:55,740 +And just like the CSV case, we +need to loop over the splits + +71 +00:02:55,740 --> 00:02:57,753 +to save each one as an individual file. + +72 +00:02:59,580 --> 00:03:02,940 +And once our datasets are +saved as JSON or Parquet files, + +73 +00:03:02,940 --> 00:03:03,990 +we can reload them again + +74 +00:03:03,990 --> 00:03:06,960 +with the appropriate script +in the load_dataset function. + +75 +00:03:06,960 --> 00:03:09,993 +And we just need to provide a +data_files argument as before. + +76 +00:03:10,860 --> 00:03:11,910 +This example shows + +77 +00:03:11,910 --> 00:03:14,560 +how we can reload our save +datasets in either format. 
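Here is a sketch of the save-and-reload loop for the flat file formats, assuming the usual train, validation and test split names of the allocine dataset. Swap to_csv for to_json or to_parquet, and the "csv" loading script for "json" or "parquet", to cover the other formats:

from datasets import load_dataset

raw_datasets = load_dataset("allocine")

# Save each split as an individual file
for split, dataset in raw_datasets.items():
    dataset.to_csv(f"allocine-{split}.csv", index=None)

# Reload by pointing the loading script at the right files
data_files = {
    "train": "allocine-train.csv",
    "validation": "allocine-validation.csv",
    "test": "allocine-test.csv",
}
reloaded = load_dataset("csv", data_files=data_files)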
+ +78 +00:03:16,620 --> 00:03:17,970 +And with that, you now know + +79 +00:03:17,970 --> 00:03:20,220 +how to save your datasets +in various formats. + +80 +00:03:21,441 --> 00:03:24,358 +(transition music) + diff --git a/subtitles/en/39_memory-mapping-&-streaming.srt b/subtitles/en/39_memory-mapping-&-streaming.srt index 162665241..2efb51f8a 100644 --- a/subtitles/en/39_memory-mapping-&-streaming.srt +++ b/subtitles/en/39_memory-mapping-&-streaming.srt @@ -1,183 +1,370 @@ -1 -00:00:05,520 --> 00:00:10,720 -Memory mapping and streaming. In this video we'll  -take a look at two core features of the Datasets   - -2 -00:00:10,720 --> 00:00:15,920 -library that allow you to load and process huge  -datasets without blowing up your laptop's CPU. - -3 -00:00:18,160 --> 00:00:22,720 -Nowadays it is not uncommon to find yourself  -working with multi-GB sized datasets,   - -4 -00:00:22,720 --> 00:00:26,880 -especially if you’re planning to pretrain a  -transformer like BERT or GPT-2 from scratch.   - -5 -00:00:27,920 --> 00:00:30,480 -In these cases, even *loading*  -the data can be a challenge.   - -6 -00:00:31,040 --> 00:00:36,560 -For example, the C4 corpus used to -pretrain T5 consists of over 2 terabytes of data! - -7 -00:00:38,160 --> 00:00:42,720 -To handle these large datasets, the Datasets  -library is built on two core features:   - -8 -00:00:42,720 --> 00:00:45,120 -the Apache Arrow format and a streaming API.   - -9 -00:00:46,160 --> 00:00:51,120 -Arrow is designed for high-performance data  -processing and represents each table-like dataset   - -10 -00:00:51,120 --> 00:00:56,240 -with an in-memory columnar format. As you can  -see in this example, columnar formats group   - -11 -00:00:56,240 --> 00:01:01,280 -the elements of a table in consecutive blocks of  -RAM and this unlocks fast access and processing.   - -12 -00:01:02,560 --> 00:01:07,600 -Arrow is great at processing data at any scale,  -but some datasets are so large that you can't even   - -13 -00:01:07,600 --> 00:01:12,480 -fit them on your hard disk. For these cases,  -the Datasets library provides a streaming API   - -14 -00:01:13,040 --> 00:01:18,080 -that allows you to progressively download the  -raw data one element at a time. The result is   - -15 -00:01:18,080 --> 00:01:21,600 -a special object called an IterableDataset  -that we'll see in more detail soon.   - -16 -00:01:23,520 --> 00:01:28,160 -Let's start by looking at why Arrow is so  -powerful. The first feature is that it treat every   - -17 -00:01:28,160 --> 00:01:34,000 -dataset as a memory-mapped file. Memory mapping  -is a mechanism that maps a portion of a file or   - -18 -00:01:34,000 --> 00:01:38,967 -an entire file on disk to a chunk of virtual  -memory. This allows applications to access can   - -19 -00:01:38,967 --> 00:01:43,360 -access segments in an extremely large file without  -having to read the entire file into memory first.   - -20 -00:01:44,960 --> 00:01:49,040 -Another cool feature of Arrow's memory  -mapping capability is that it allows multiple   - -21 -00:01:49,040 --> 00:01:53,840 -processes to work with the same large dataset  -without moving it or copying it in any way.   - -22 -00:01:55,520 --> 00:01:59,920 -This "zero-copy" feature of Arrow makes it  -extremely fast for iterating over a dataset.   - -23 -00:02:00,480 --> 00:02:05,920 -In this example you can see that we iterate over  -15 million rows in about a minute using a standard   - -24 -00:02:05,920 --> 00:02:12,480 -laptop - that's not too bad at all! 
Let's now  -take a look at how we can stream a large dataset.   - -25 -00:02:12,480 --> 00:02:16,720 -The only change you need to make is to set the  -streaming=True argument in the load_dataset()   - -26 -00:02:16,720 --> 00:02:21,120 -function. This will return a special  -IterableDataset object, which is a bit different   - -27 -00:02:21,120 --> 00:02:26,160 -to the Dataset objects we've seen in other  -videos. This object is an iterable, which means   - -28 -00:02:26,160 --> 00:02:31,680 -we can't index it to access elements, but instead  -iterate on it using the iter and next methods.   - -29 -00:02:32,640 --> 00:02:36,080 -This will download and access a single  -example from the dataset, which means   - -30 -00:02:36,080 --> 00:02:39,760 -you can progressively iterate through a huge  -dataset without having to download it first.   - -31 -00:02:41,840 --> 00:02:47,040 -Tokenizing text with the map() method also works  -in a similar way. We first stream the dataset and   - -32 -00:02:47,040 --> 00:02:52,480 -then apply the map() method with the tokenizer. To  -get the first tokenized example we apply iter and   - -33 -00:02:52,480 --> 00:02:58,560 -next. The main difference with an IterableDataset  -is that instead of using the select() method to   - -34 -00:02:58,560 --> 00:03:04,240 -return example, we use the take() and skip()  -methods because we can't index into the dataset.   - -35 -00:03:04,240 --> 00:03:10,320 -The take() method returns the first N examples  -in the dataset, while skip() skips the first N   - -36 -00:03:10,320 --> 00:03:15,680 -and returns the rest. You can see examples  -of both in action here, where we create   - -37 -00:03:15,680 --> 00:03:27,040 -a validation set from the first 1000 examples  -and then skip those to create the training set. +1 +00:00:00,511 --> 00:00:01,784 +(air whooshing) + +2 +00:00:01,784 --> 00:00:02,964 +(logo popping) + +3 +00:00:02,964 --> 00:00:05,640 +(metal sliding) + +4 +00:00:05,640 --> 00:00:07,203 +- Memory mapping and streaming. + +5 +00:00:08,040 --> 00:00:09,180 +In this video, we'll take a look + +6 +00:00:09,180 --> 00:00:11,520 +at two core features +of the Datasets library + +7 +00:00:11,520 --> 00:00:14,220 +that allow you to load +and process huge datasets + +8 +00:00:14,220 --> 00:00:16,263 +without blowing up your laptop's CPU. + +9 +00:00:18,300 --> 00:00:20,280 +Nowadays, it's not +uncommon to find yourself + +10 +00:00:20,280 --> 00:00:22,950 +working with multi-GB sized datasets, + +11 +00:00:22,950 --> 00:00:24,420 +especially if you're planning to pretrain + +12 +00:00:24,420 --> 00:00:28,110 +a transformer like BERT +or GPT-2 from scratch. + +13 +00:00:28,110 --> 00:00:31,260 +In these cases, even loading +the data can be a challenge. + +14 +00:00:31,260 --> 00:00:34,680 +For example, the c4 +corpus used to pretrain T5 + +15 +00:00:34,680 --> 00:00:36,903 +consists of over two terabytes of data. + +16 +00:00:38,400 --> 00:00:40,050 +To handle these large datasets, + +17 +00:00:40,050 --> 00:00:42,990 +the Datasets library is +built on two core features: + +18 +00:00:42,990 --> 00:00:46,350 +the Apache Arrow format +and a streaming API. + +19 +00:00:46,350 --> 00:00:49,110 +Arrow is designed for +high-performance data processing + +20 +00:00:49,110 --> 00:00:51,360 +and represents each table-like dataset + +21 +00:00:51,360 --> 00:00:52,773 +with a column-based format. 
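If you want to reproduce that kind of measurement yourself, a simple sketch looks like this; the dataset name is only an example and the exact timing will of course depend on your machine:

import time
from datasets import load_dataset

dataset = load_dataset("allocine", split="train")   # any dataset works here

start = time.time()
batch_size = 1000
for idx in range(0, len(dataset), batch_size):
    _ = dataset[idx : idx + batch_size]  # reads straight from the memory-mapped Arrow file
print(f"Iterated over {len(dataset)} rows in {time.time() - start:.1f}s")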
+ +22 +00:00:53,730 --> 00:00:56,130 +As you can see in this +example, column-based formats + +23 +00:00:56,130 --> 00:00:59,280 +group the elements of a table +in consecutive blocks of RAM + +24 +00:00:59,280 --> 00:01:01,563 +and this unlocks fast +access and processing. + +25 +00:01:02,760 --> 00:01:05,550 +Arrow is great at +processing data at any scale + +26 +00:01:05,550 --> 00:01:07,110 +but some datasets are so large + +27 +00:01:07,110 --> 00:01:09,600 +that you can't even fit +them on your hard disk. + +28 +00:01:09,600 --> 00:01:11,730 +So for these cases, the +Datasets library provides + +29 +00:01:11,730 --> 00:01:14,820 +a streaming API that allows +you to progressively download + +30 +00:01:14,820 --> 00:01:17,700 +the raw data one element at a time. + +31 +00:01:17,700 --> 00:01:20,430 +The result is a special object +called an IterableDataset + +32 +00:01:20,430 --> 00:01:22,180 +that we'll see in more detail soon. + +33 +00:01:23,700 --> 00:01:26,670 +Let's start by looking at +why Arrow is so powerful. + +34 +00:01:26,670 --> 00:01:28,860 +The first feature is that +it treats every dataset + +35 +00:01:28,860 --> 00:01:30,153 +as a memory-mapped file. + +36 +00:01:31,020 --> 00:01:32,430 +Now, memory mapping is a mechanism + +37 +00:01:32,430 --> 00:01:35,400 +that maps a portion of a file +or an entire file and disc + +38 +00:01:35,400 --> 00:01:37,410 +to a chunk of virtual memory. + +39 +00:01:37,410 --> 00:01:38,520 +This allows applications + +40 +00:01:38,520 --> 00:01:41,280 +to access segments of +an extremely large file + +41 +00:01:41,280 --> 00:01:44,080 +without having to read the +whole file into memory first. + +42 +00:01:45,150 --> 00:01:48,120 +Another cool feature of Arrow's +memory mapping capabilities + +43 +00:01:48,120 --> 00:01:49,860 +is that it allows multiple processes + +44 +00:01:49,860 --> 00:01:51,840 +to work with the same large dataset + +45 +00:01:51,840 --> 00:01:54,333 +without moving it or +copying it in any way. + +46 +00:01:55,680 --> 00:01:57,570 +This zero-copy feature of Arrow + +47 +00:01:57,570 --> 00:02:00,600 +makes it extremely fast for +iterating over a dataset. + +48 +00:02:00,600 --> 00:02:02,640 +And this example, you +can see that we iterate + +49 +00:02:02,640 --> 00:02:05,160 +over 15 million rows in about a minute + +50 +00:02:05,160 --> 00:02:06,780 +just using a standard laptop. + +51 +00:02:06,780 --> 00:02:08,080 +That's not too bad at all. + +52 +00:02:09,750 --> 00:02:12,660 +Let's now take a look at how +we can stream a large dataset. + +53 +00:02:12,660 --> 00:02:14,520 +The only change you need to make is to set + +54 +00:02:14,520 --> 00:02:17,910 +the streaming=True argument in +the load_dataset() function. + +55 +00:02:17,910 --> 00:02:20,580 +This will return a special +IterableDataset object + +56 +00:02:20,580 --> 00:02:22,260 +which is a bit different +to the Dataset objects + +57 +00:02:22,260 --> 00:02:24,330 +we've seen in other videos. + +58 +00:02:24,330 --> 00:02:25,980 +This object is an iterable, + +59 +00:02:25,980 --> 00:02:28,530 +which means we can't index +it to access elements, + +60 +00:02:28,530 --> 00:02:30,180 +but instead we iterate on it + +61 +00:02:30,180 --> 00:02:32,850 +using the iter and next methods. 
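Putting the streaming pieces together, a minimal sketch might look like the following. The corpus id and config are assumptions (any large Hub dataset works), as is the "text" column name:

from datasets import load_dataset
from transformers import AutoTokenizer

streamed_dataset = load_dataset(
    "allenai/c4", "en", split="train", streaming=True
)

# No indexing on an IterableDataset: pull examples one at a time instead
print(next(iter(streamed_dataset)))

# map() is lazy here, so tokenization happens as we iterate
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenized = streamed_dataset.map(lambda example: tokenizer(example["text"]))
print(next(iter(tokenized)))

# take() and skip() replace select() for carving out subsets of the stream
validation_stream = streamed_dataset.take(1000)
train_stream = streamed_dataset.skip(1000)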
+ +62 +00:02:32,850 --> 00:02:34,050 +This will download and access + +63 +00:02:34,050 --> 00:02:35,850 +a single example from the dataset, + +64 +00:02:35,850 --> 00:02:37,410 +which means you can progressively iterate + +65 +00:02:37,410 --> 00:02:40,360 +through a huge dataset without +having to download it first. + +66 +00:02:42,150 --> 00:02:43,590 +Tokenizing text with a map() method + +67 +00:02:43,590 --> 00:02:45,660 +also works in a similar way. + +68 +00:02:45,660 --> 00:02:47,160 +We first stream the dataset + +69 +00:02:47,160 --> 00:02:49,830 +and then apply the map() +method with the tokenizer. + +70 +00:02:49,830 --> 00:02:53,283 +To get the first tokenized +example, we apply iter and next. + +71 +00:02:54,750 --> 00:02:57,210 +The main difference with +an IterableDataset is that + +72 +00:02:57,210 --> 00:02:59,970 +instead of using a select() +method to return examples, + +73 +00:02:59,970 --> 00:03:01,530 +we use the take() and skip() methods + +74 +00:03:01,530 --> 00:03:03,573 +because we can't index into the dataset. + +75 +00:03:04,470 --> 00:03:05,460 +The take() method returns + +76 +00:03:05,460 --> 00:03:07,500 +the first N examples in the dataset, + +77 +00:03:07,500 --> 00:03:09,270 +while skip(), as you can imagine, + +78 +00:03:09,270 --> 00:03:12,480 +skips the first N and returns the rest. + +79 +00:03:12,480 --> 00:03:15,300 +You can see examples of both +of these methods in action + +80 +00:03:15,300 --> 00:03:16,710 +where we create a validation set + +81 +00:03:16,710 --> 00:03:18,660 +from the first 1000 examples + +82 +00:03:18,660 --> 00:03:21,010 +and then skip those to +create the training set. + +83 +00:03:23,012 --> 00:03:25,762 +(air whooshing) + diff --git a/subtitles/en/40_uploading-a-dataset-to-the-hub.srt b/subtitles/en/40_uploading-a-dataset-to-the-hub.srt index 3f3caec00..ee571bef0 100644 --- a/subtitles/en/40_uploading-a-dataset-to-the-hub.srt +++ b/subtitles/en/40_uploading-a-dataset-to-the-hub.srt @@ -1,109 +1,228 @@ -1 -00:00:07,760 --> 00:00:11,760 -In this video we'll take a look at how you  -upload your very own dataset to the Hub.   - -2 -00:00:13,520 --> 00:00:16,560 -The first you'll need to do is create  -a new dataset repository on the Hub.   - -3 -00:00:17,360 --> 00:00:20,480 -Just click on your profile icon and  -select the "New Dataset" button.   - -4 -00:00:21,600 --> 00:00:26,720 -Next we need to assign an owner of the dataset.  -By default, this will be your Hub account,   - -5 -00:00:26,720 --> 00:00:29,840 -but you can also create datasets under  -any organisation that you belong to.   - -6 -00:00:30,720 --> 00:00:36,160 -Then we just need to give the dataset a name and  -specify whether it is a public or private dataset.   - -7 -00:00:37,200 --> 00:00:41,520 -Public datasets can be accessed by anyone,  -while private datasets can only be accessed   - -8 -00:00:41,520 --> 00:00:46,800 -by you or members of your organisation. And with  -that we can go ahead and create the dataset!   - -9 -00:00:48,480 --> 00:00:52,800 -Now that you have an empty dataset repository on  -the Hub, the next thing to do is add some data   - -10 -00:00:52,800 --> 00:00:59,360 -to it! You can do this with Git, but the easiest  -way is by selecting "Upload file" and uploading   - -11 -00:00:59,360 --> 00:01:04,880 -the files directly from your machine. After you've  -uploaded the files, you'll see them appear in the   - -12 -00:01:04,880 --> 00:01:11,360 -repository under the "Files and versions" tab.  
-The last step is to create a dataset card. Well   - -13 -00:01:11,360 --> 00:01:14,160 -documented datasets are more likely to be useful  -to others (including your future self!) as they   - -14 -00:01:14,160 --> 00:01:18,400 -provide the context to decide whether the dataset  -is relevant or whether there are any biases or   - -15 -00:01:18,400 --> 00:01:23,680 -risks associated with using the dataset. On the  -Hugging Face Hub, this information is stored in   - -16 -00:01:23,680 --> 00:01:29,440 -each repository’s README.md file and there are  -two main steps you should take. First you need   - -17 -00:01:29,440 --> 00:01:33,360 -to create some metadata that will allow your  -dataset to be easily found by others on the Hub.   - -18 -00:01:34,400 --> 00:01:38,560 -You can create this metadata using the Datasets  -Tagging Application which we'll link to in the   - -19 -00:01:38,560 --> 00:01:43,040 -video description. Once you have created the  -metadata, you can fill out the rest of the   - -20 -00:01:43,040 --> 00:01:49,200 -dataset card and we provide a template that is  -also linked in the video. And once your dataset   - -21 -00:01:49,200 --> 00:01:53,680 -is up on the Hub, you can load it using the  -trusty load_dataset() function! Just provide   - -22 -00:01:53,680 --> 00:02:04,000 -the name of your repository and a data_files  -argument for the files and you're good to go! +1 +00:00:00,000 --> 00:00:02,917 +(transition music) + +2 +00:00:05,490 --> 00:00:07,950 +- Uploading a dataset to the hub. + +3 +00:00:07,950 --> 00:00:09,060 +In this video, we'll take a look + +4 +00:00:09,060 --> 00:00:10,860 +at how you can upload +your very own dataset + +5 +00:00:10,860 --> 00:00:12,060 +to the Hugging Face Hub. + +6 +00:00:13,680 --> 00:00:14,670 +The first thing you need to do + +7 +00:00:14,670 --> 00:00:17,400 +is create a new dataset +repository on the hub. + +8 +00:00:17,400 --> 00:00:19,260 +So, just click on your profile icon + +9 +00:00:19,260 --> 00:00:21,750 +and select the New Dataset button. + +10 +00:00:21,750 --> 00:00:24,750 +Next, we need to assign +an owner of the dataset. + +11 +00:00:24,750 --> 00:00:26,970 +By default, this will be your hub account, + +12 +00:00:26,970 --> 00:00:28,170 +but you can also create datasets + +13 +00:00:28,170 --> 00:00:30,585 +under any organization that you belong to. + +14 +00:00:30,585 --> 00:00:33,780 +Then, we just need to give +the dataset a good name + +15 +00:00:33,780 --> 00:00:36,513 +and specify whether it is a +public or private dataset. + +16 +00:00:37,410 --> 00:00:39,810 +Public datasets can be accessed by anyone + +17 +00:00:39,810 --> 00:00:41,670 +while private datasets +can only be accessed + +18 +00:00:41,670 --> 00:00:43,653 +by you or members of your organization. + +19 +00:00:44,580 --> 00:00:47,280 +And with that, we can go +ahead and create the dataset. + +20 +00:00:48,690 --> 00:00:51,060 +Now that you have an empty +dataset repository on the hub, + +21 +00:00:51,060 --> 00:00:53,880 +the next thing to do is +add some actual data to it. + +22 +00:00:53,880 --> 00:00:55,050 +You can do this with git, + +23 +00:00:55,050 --> 00:00:57,960 +but the easiest way is by +selecting the Upload file button. + +24 +00:00:57,960 --> 00:00:59,160 +And then, you can just go ahead + +25 +00:00:59,160 --> 00:01:02,243 +and upload the files +directly from your machine. 
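Once your files are up, loading them back is a one-liner; the repository id and the file names below are placeholders for whatever you actually uploaded:

from datasets import load_dataset

dataset = load_dataset(
    "my-username/my-dataset",
    data_files={"train": "train.csv", "test": "test.csv"},
)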
+ +26 +00:01:02,243 --> 00:01:03,846 +After you've uploaded your files, + +27 +00:01:03,846 --> 00:01:05,670 +you'll see them appear in the repository + +28 +00:01:05,670 --> 00:01:07,320 +under the Files and versions tab. + +29 +00:01:08,550 --> 00:01:11,370 +The last step is to create a dataset card. + +30 +00:01:11,370 --> 00:01:13,590 +Well-documented datasets +are more likely to be useful + +31 +00:01:13,590 --> 00:01:15,600 +to others as they provide +the context to decide + +32 +00:01:15,600 --> 00:01:17,370 +whether the dataset is relevant + +33 +00:01:17,370 --> 00:01:18,450 +or whether there are any biases + +34 +00:01:18,450 --> 00:01:20,673 +or risks associated +with using the dataset. + +35 +00:01:21,540 --> 00:01:22,710 +On the Hugging Face Hub, + +36 +00:01:22,710 --> 00:01:25,650 +this information is stored in +each repositories README file. + +37 +00:01:25,650 --> 00:01:27,988 +There are two main steps +that you should take. + +38 +00:01:27,988 --> 00:01:30,651 +First, you need to create some metadata + +39 +00:01:30,651 --> 00:01:32,010 +that will allow your dataset + +40 +00:01:32,010 --> 00:01:34,590 +to be easily found by others on the hub. + +41 +00:01:34,590 --> 00:01:35,670 +You can create this metadata + +42 +00:01:35,670 --> 00:01:37,860 +using the datasets tagging application, + +43 +00:01:37,860 --> 00:01:40,620 +which we'll link to in +the video description. + +44 +00:01:40,620 --> 00:01:42,240 +Once you've created the metadata, + +45 +00:01:42,240 --> 00:01:44,190 +you can fill out the +rest of the dataset card, + +46 +00:01:44,190 --> 00:01:45,240 +and we provide a template + +47 +00:01:45,240 --> 00:01:47,090 +that we'll also link to in the video. + +48 +00:01:48,480 --> 00:01:50,280 +And once your dataset is on the hub, + +49 +00:01:50,280 --> 00:01:53,400 +you can load it using the +trusty load_dataset function. + +50 +00:01:53,400 --> 00:01:55,015 +Just provide the name of your repository + +51 +00:01:55,015 --> 00:01:57,843 +and a data_files argument, +and you're good to go. + +52 +00:01:59,619 --> 00:02:02,536 +(transition music) + diff --git a/subtitles/en/41_text-embeddings-&-semantic-search.srt b/subtitles/en/41_text-embeddings-&-semantic-search.srt index 128d54a4f..51c9d9b29 100644 --- a/subtitles/en/41_text-embeddings-&-semantic-search.srt +++ b/subtitles/en/41_text-embeddings-&-semantic-search.srt @@ -1,184 +1,368 @@ -1 -00:00:05,520 --> 00:00:11,200 -Text embeddings and semantic search. In this video  -we’ll explore how Transformer models represent   - -2 -00:00:11,200 --> 00:00:15,920 -text as embedding vectors and how these vectors  -can be used to find similar documents in a corpus.   - -3 -00:00:17,520 --> 00:00:22,000 -Text embeddings are just a fancy way of saying  -that we can represent text as an array of numbers   - -4 -00:00:22,000 --> 00:00:27,120 -called a vector. To create these embeddings we  -usually use an encoder-based model like BERT.   - -5 -00:00:28,320 --> 00:00:32,320 -In this example, you can see how we feed  -three sentences to the encoder and get   - -6 -00:00:32,320 --> 00:00:36,400 -three vectors as the output. Reading  -the text, we can see that walking the   - -7 -00:00:36,400 --> 00:00:40,880 -dog seems to be most similar to walking the  -cat, but let's see if we can quantify this!   - -8 -00:00:42,560 --> 00:00:46,080 -The trick to do the comparison is to  -compute a similarity metric between each   - -9 -00:00:46,080 --> 00:00:50,880 -pair of embedding vectors. 
These vectors  -usually live in a high-dimensional space,   - -10 -00:00:50,880 --> 00:00:54,640 -so a similarity metric can be anything that  -measures some sort of distance between vectors.   - -11 -00:00:55,520 --> 00:01:00,560 -One popular metric is cosine similarity, which  -uses the angle between two vectors to measure   - -12 -00:01:00,560 --> 00:01:06,160 -how close they are. In this example, our embedding  -vectors live in 3D and we can see that the orange   - -13 -00:01:06,160 --> 00:01:12,080 -and grey vectors are close to each other and have  -a smaller angle. Now one problem we have to deal   - -14 -00:01:12,080 --> 00:01:16,640 -with is that Transformer models like BERT will  -actually return one embedding vector per token.   - -15 -00:01:17,680 --> 00:01:22,560 -For example in the sentence "I took my dog for a  -walk", we can expect several embedding vectors,   - -16 -00:01:22,560 --> 00:01:28,880 -one for each word. For example, here we can see  -the output of our model has produced 9 embedding   - -17 -00:01:28,880 --> 00:01:35,200 -vectors per sentence, and each vector has 384  -dimensions. But what we really want is a single   - -18 -00:01:35,200 --> 00:01:41,040 -embedding vector for the whole sentence. To deal  -with this, we can use a technique called pooling.   - -19 -00:01:41,760 --> 00:01:45,840 -The simplest pooling method is to just  -take the token embedding of the CLS token.   - -20 -00:01:46,880 --> 00:01:50,160 -Alternatively, we can average the  -token embeddings which is called   - -21 -00:01:50,160 --> 00:01:56,400 -mean pooling. With mean pooling only thing  -we need to make sure is that we don't include   - -22 -00:01:56,400 --> 00:02:00,640 -the padding tokens in the average, which is why  -you can see the attention mask being used here.   - -23 -00:02:01,680 --> 00:02:07,160 -This now gives us one 384 dimensional vector  -per sentence which is exactly what we want! And   - -24 -00:02:07,840 --> 00:02:12,240 -once we have our sentence embeddings, we can  -compute the cosine similarity for each pair of   - -25 -00:02:12,240 --> 00:02:17,520 -vectors. In this example we use the function from  -scikit-learn and you can see that the sentence "I   - -26 -00:02:17,520 --> 00:02:22,400 -took my dog for a walk" has an overlap of 0.83  -with "I took my cat for a walk". Hooray! We   - -27 -00:02:25,040 --> 00:02:29,600 -can take this idea one step further by comparing  -the similarity between a question and a corpus   - -28 -00:02:29,600 --> 00:02:36,000 -of documents. For example, suppose we embed every  -post in the Hugging Face forums. We can then ask a   - -29 -00:02:36,000 --> 00:02:41,600 -question, embed it, and check which forum posts  -are most similar. This process is often called   - -30 -00:02:41,600 --> 00:02:48,000 -semantic search, because it allows us to compare  -queries with context. To create a semantic search   - -31 -00:02:48,000 --> 00:02:54,400 -engine is quite simple in Datasets. First we  -need to embed all the documents. In this example,   - -32 -00:02:54,400 --> 00:02:59,120 -we take a small sample from the SQUAD dataset  -and apply the same embedding logic as before.   - -33 -00:03:00,000 --> 00:03:03,840 -This gives us a new column called "embeddings"  -that stores the embedding of every passage.   - -34 -00:03:05,680 --> 00:03:09,280 -Once we have our embeddings, we need a  -way to find nearest neighbours to a query.   
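A compact sketch of the pipeline described so far is shown below. The checkpoint is an assumption (any encoder that produces 384-dimensional token embeddings matches the numbers quoted in the video), and the sentences simply mirror the walking-the-dog example, with an arbitrary filler sentence in the middle:

import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

ckpt = "sentence-transformers/all-MiniLM-L6-v2"   # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt)

sentences = [
    "I took my dog for a walk",
    "Today is going to rain",
    "I took my cat for a walk",
]
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    token_embeddings = model(**inputs).last_hidden_state   # [batch, seq_len, hidden]

# Mean pooling: average the token embeddings, masking out the padding positions
mask = inputs["attention_mask"].unsqueeze(-1).float()
sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)

# Pairwise cosine similarities between the three sentences
print(cosine_similarity(sentence_embeddings.numpy()))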
- -35 -00:03:10,080 --> 00:03:14,320 -Datasets provides a special object called a  -FAISS index that allows you to quickly compare   - -36 -00:03:14,320 --> 00:03:18,880 -embedding vectors. So we add the  -FAISS index, embed a question and   - -37 -00:03:18,880 --> 00:03:29,360 -voila! we've now found the 3 most similar  -articles which might store the answer. +1 +00:00:00,621 --> 00:00:03,204 +(upbeat music) + +2 +00:00:05,670 --> 00:00:08,520 +- Text embeddings and semantic search. + +3 +00:00:08,520 --> 00:00:10,770 +In this video we'll explore +how Transformer models + +4 +00:00:10,770 --> 00:00:12,810 +represent text as embedding vectors + +5 +00:00:12,810 --> 00:00:15,420 +and how these vectors can be +used to find similar documents + +6 +00:00:15,420 --> 00:00:16,293 +in a corpus. + +7 +00:00:17,730 --> 00:00:19,890 +Text embeddings are just +a fancy way of saying + +8 +00:00:19,890 --> 00:00:22,170 +that we can represent text +as an array of numbers + +9 +00:00:22,170 --> 00:00:23,640 +called a vector. + +10 +00:00:23,640 --> 00:00:25,710 +To create these embeddings we usually use + +11 +00:00:25,710 --> 00:00:27,393 +an encoder-based model like BERT. + +12 +00:00:28,530 --> 00:00:31,290 +In this example, you can see +how we feed three sentences + +13 +00:00:31,290 --> 00:00:34,830 +to the encoder and get +three vectors as the output. + +14 +00:00:34,830 --> 00:00:37,050 +Reading the text, we can +see that walking the dog + +15 +00:00:37,050 --> 00:00:39,450 +seems to be most similar +to walking the cat, + +16 +00:00:39,450 --> 00:00:41,350 +but let's see if we can quantify this. + +17 +00:00:42,810 --> 00:00:44,040 +The trick to do the comparison + +18 +00:00:44,040 --> 00:00:45,630 +is to compute a similarity metric + +19 +00:00:45,630 --> 00:00:48,210 +between each pair of embedding vectors. + +20 +00:00:48,210 --> 00:00:51,120 +These vectors usually live in +a very high-dimensional space, + +21 +00:00:51,120 --> 00:00:53,190 +so a similarity metric can +be anything that measures + +22 +00:00:53,190 --> 00:00:55,740 +some sort of distance between vectors. + +23 +00:00:55,740 --> 00:00:58,560 +One very popular metric +is cosine similarity, + +24 +00:00:58,560 --> 00:01:00,390 +which uses the angle between two vectors + +25 +00:01:00,390 --> 00:01:02,610 +to measure how close they are. + +26 +00:01:02,610 --> 00:01:05,250 +In this example, our +embedding vectors live in 3D + +27 +00:01:05,250 --> 00:01:07,110 +and we can see that the +orange and Grey vectors + +28 +00:01:07,110 --> 00:01:09,560 +are close to each other +and have a smaller angle. + +29 +00:01:11,130 --> 00:01:12,510 +Now one problem we have to deal with + +30 +00:01:12,510 --> 00:01:15,180 +is that Transformer models +like BERT will actually return + +31 +00:01:15,180 --> 00:01:16,983 +one embedding vector per token. + +32 +00:01:17,880 --> 00:01:20,700 +For example in the sentence, +"I took my dog for a walk," + +33 +00:01:20,700 --> 00:01:23,853 +we can expect several embedding +vectors, one for each word. + +34 +00:01:25,110 --> 00:01:27,870 +For example, here we can +see the output of our model + +35 +00:01:27,870 --> 00:01:30,540 +has produced 9 embedding +vectors per sentence, + +36 +00:01:30,540 --> 00:01:33,750 +and each vector has 384 dimensions. + +37 +00:01:33,750 --> 00:01:36,210 +But what we really want is +a single embedding vector + +38 +00:01:36,210 --> 00:01:37,353 +for each sentence. + +39 +00:01:38,940 --> 00:01:42,060 +To deal with this, we can use +a technique called pooling. 
+ +40 +00:01:42,060 --> 00:01:43,050 +The simplest pooling method + +41 +00:01:43,050 --> 00:01:44,520 +is to just take the token embedding + +42 +00:01:44,520 --> 00:01:46,203 +of the special CLS token. + +43 +00:01:47,100 --> 00:01:49,650 +Alternatively, we can +average the token embeddings + +44 +00:01:49,650 --> 00:01:52,500 +which is called mean pooling +and this is what we do here. + +45 +00:01:53,370 --> 00:01:55,800 +With mean pooling the only +thing we need to make sure + +46 +00:01:55,800 --> 00:01:58,410 +is that we don't include the +padding tokens in the average, + +47 +00:01:58,410 --> 00:02:01,860 +which is why you can see the +attention mask being used here. + +48 +00:02:01,860 --> 00:02:05,100 +This gives us a 384 dimensional +vector for each sentence + +49 +00:02:05,100 --> 00:02:06,600 +which is exactly what we want. + +50 +00:02:07,920 --> 00:02:09,810 +And once we have our sentence embeddings, + +51 +00:02:09,810 --> 00:02:11,730 +we can compute the cosine similarity + +52 +00:02:11,730 --> 00:02:13,113 +for each pair of vectors. + +53 +00:02:13,993 --> 00:02:16,350 +In this example we use the +function from scikit-learn + +54 +00:02:16,350 --> 00:02:19,140 +and you can see that the sentence +"I took my dog for a walk" + +55 +00:02:19,140 --> 00:02:22,140 +has indeed a strong overlap +with "I took my cat for a walk". + +56 +00:02:22,140 --> 00:02:23,240 +Hooray! We've done it. + +57 +00:02:25,110 --> 00:02:27,180 +We can actually take this +idea one step further + +58 +00:02:27,180 --> 00:02:29,220 +by comparing the similarity +between a question + +59 +00:02:29,220 --> 00:02:31,170 +and a corpus of documents. + +60 +00:02:31,170 --> 00:02:33,810 +For example, suppose we embed every post + +61 +00:02:33,810 --> 00:02:35,430 +in the Hugging Face forums. + +62 +00:02:35,430 --> 00:02:37,800 +We can then ask a question, embed it, + +63 +00:02:37,800 --> 00:02:40,590 +and check which forum +posts are most similar. + +64 +00:02:40,590 --> 00:02:42,750 +This process is often +called semantic search, + +65 +00:02:42,750 --> 00:02:45,423 +because it allows us to +compare queries with context. + +66 +00:02:47,040 --> 00:02:48,450 +To create a semantic search engine + +67 +00:02:48,450 --> 00:02:51,030 +is actually quite simple +in the datasets library. + +68 +00:02:51,030 --> 00:02:53,340 +First we need to embed all the documents. + +69 +00:02:53,340 --> 00:02:56,070 +And in this example, +we take a small sample + +70 +00:02:56,070 --> 00:02:57,780 +from the SQUAD dataset and apply + +71 +00:02:57,780 --> 00:03:00,180 +the same embedding logic as before. + +72 +00:03:00,180 --> 00:03:02,280 +This gives us a new +column called embeddings, + +73 +00:03:02,280 --> 00:03:04,530 +which stores the embeddings +of every passage. + +74 +00:03:05,880 --> 00:03:07,260 +Once we have our embeddings, + +75 +00:03:07,260 --> 00:03:10,200 +we need a way to find nearest +neighbors for a query. + +76 +00:03:10,200 --> 00:03:13,170 +The datasets library provides +a special object called FAISS + +77 +00:03:13,170 --> 00:03:16,080 +which allows you to quickly +compare embedding vectors. + +78 +00:03:16,080 --> 00:03:19,950 +So we add the FAISS index, +embed a question and voila, + +79 +00:03:19,950 --> 00:03:21,870 +we've now found the 3 +most similar articles + +80 +00:03:21,870 --> 00:03:23,320 +which might store the answer. 
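And here is a self-contained sketch of the FAISS part, reusing the same mean-pooling idea. The checkpoint, the toy passages and the question are all made up for illustration, and you will need faiss installed (for example, pip install faiss-cpu):

import torch
from datasets import Dataset
from transformers import AutoModel, AutoTokenizer

ckpt = "sentence-transformers/all-MiniLM-L6-v2"   # same assumed checkpoint as above
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModel.from_pretrained(ckpt)

def embed(texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        token_embeddings = model(**inputs).last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    return ((token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)).numpy()

# A tiny stand-in corpus; in the video this would be SQuAD passages or forum posts
corpus = Dataset.from_dict({"text": [
    "You can load any dataset from the Hub with load_dataset.",
    "Mean pooling averages token embeddings into one sentence vector.",
    "FAISS lets you search embedding vectors very quickly.",
]})
embeddings_dataset = corpus.map(lambda x: {"embeddings": embed([x["text"]])[0]})

# Add a FAISS index on the embeddings column, embed a question and query it
embeddings_dataset.add_faiss_index(column="embeddings")
question_embedding = embed(["How do I search embeddings quickly?"])
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=3
)
print(samples["text"])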
+ +81 +00:03:25,182 --> 00:03:27,849 +(upbeat music) + diff --git a/subtitles/en/42_training-a-new-tokenizer.srt b/subtitles/en/42_training-a-new-tokenizer.srt index 00255f7b1..d2d2ded51 100644 --- a/subtitles/en/42_training-a-new-tokenizer.srt +++ b/subtitles/en/42_training-a-new-tokenizer.srt @@ -1,322 +1,512 @@ -1 -00:00:05,310 --> 00:00:12,220 -In this video we will see together what is -the purpose of training a tokenizer, what - -2 -00:00:12,220 --> 00:00:18,770 -are the key steps to follow and what is the -easiest way to do it. - -3 -00:00:18,770 --> 00:00:23,039 -You will ask yourself the question "Should -I train a new tokenizer?" - -4 -00:00:23,039 --> 00:00:27,369 -when you plan to train a new model from scratch. - -5 -00:00:27,369 --> 00:00:36,600 -A trained tokenizer would not be suitable -for your corpus if your corpus is in a different - -6 -00:00:36,600 --> 00:00:43,640 -language, uses new characters such as accents -or upper cased letters, has a specific vocabulary, - -7 -00:00:43,640 --> 00:00:50,980 -for example medical or legal, or uses a different -style, a language from another century for - -8 -00:00:50,980 --> 00:00:51,989 -instance. - -9 -00:00:51,989 --> 00:01:00,719 -For example, if I take the tokenizer trained -for the bert-base-uncased model and ignore - -10 -00:01:00,719 --> 00:01:08,580 -its normalization step then we can see that -the tokenization operation on the English - -11 -00:01:08,580 --> 00:01:14,310 -sentence "here is a sentence adapted to our -tokenizer" produces a rather satisfactory - -12 -00:01:14,310 --> 00:01:20,820 -list of tokens in the sense that this sentence -of 8 words is tokenized into 9 tokens. - -13 -00:01:20,820 --> 00:01:29,909 -On the other hand if we use this same tokenizer -on a sentence in Bengali, we see that either - -14 -00:01:29,909 --> 00:01:36,320 -a word is divided into many sub tokens or -that the tokenizer does not know one of the - -15 -00:01:36,320 --> 00:01:41,359 -unicode characters and returns only an unknown -token. - -16 -00:01:41,359 --> 00:01:47,350 -The fact that a "common" word is split into -many tokens can be problematic because language - -17 -00:01:47,350 --> 00:01:52,750 -models can only handle a sequence of tokens -of limited length. - -18 -00:01:52,750 --> 00:01:59,290 -A tokenizer that excessively splits your initial -text may even impact the performance of your - -19 -00:01:59,290 --> 00:02:00,290 -model. - -20 -00:02:00,290 --> 00:02:05,060 -Unknown tokens are also problematic because -the model will not be able to extract any - -21 -00:02:05,060 --> 00:02:11,440 -information from the "unknown" part of the -text. - -22 -00:02:11,440 --> 00:02:16,910 -In this other example, we can see that the -tokenizer replaces words containing characters - -23 -00:02:16,910 --> 00:02:21,230 -with accents and capital letters with unknown -tokens. - -24 -00:02:21,230 --> 00:02:28,140 -Finally, if we use again this tokenizer to -tokenize medical vocabulary we see again that - -25 -00:02:28,140 --> 00:02:37,349 -a single word is divided into many sub tokens: -4 for "paracetamol" and "pharyngitis". - -26 -00:02:37,349 --> 00:02:42,050 -Most of the tokenizers used by the current -state of the art language models need to be - -27 -00:02:42,050 --> 00:02:48,160 -trained on a corpus that is similar to the -one used to pre-train the language model. 
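If you want to check this on your own corpus, the quickest test is simply to tokenize a few representative strings with the pretrained tokenizer and look at how fragmented the output is; the expectations in the comments follow the video's examples:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Domain-specific vocabulary tends to be split into several word pieces
print(tokenizer.tokenize("paracetamol"))   # expect a handful of sub-tokens, not one
print(tokenizer.tokenize("pharyngitis"))

# Text in a script the vocabulary has never seen is heavily split
# or mapped to the unknown token
print(tokenizer.tokenize("এটি একটি বাক্য"))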
- -28 -00:02:48,160 --> 00:02:54,390 -This training consists in learning rules to -divide the text into tokens and the way to - -29 -00:02:54,390 --> 00:03:00,510 -learn these rules and use them depends on -the chosen tokenizer model. - -30 -00:03:00,510 --> 00:03:06,710 -Thus, to train a new tokenizer it is first -necessary to build a training corpus composed - -31 -00:03:06,710 --> 00:03:09,239 -of raw texts. - -32 -00:03:09,239 --> 00:03:13,440 -Then, you have to choose an architecture for -your tokenizer. - -33 -00:03:13,440 --> 00:03:19,640 -Here there are two options: the simplest is -to reuse the same architecture as the one - -34 -00:03:19,640 --> 00:03:26,760 -of a tokenizer used by another model already -trained,otherwise it is also possible to completely - -35 -00:03:26,760 --> 00:03:33,950 -design your tokenizer but it requires more -experience and attention. - -36 -00:03:33,950 --> 00:03:39,620 -Once the architecture is chosen, one can thus -train this tokenizer on your constituted corpus. - -37 -00:03:39,620 --> 00:03:44,870 -Finally, the last thing that you need to do -is to save the learned rules to be able to - -38 -00:03:44,870 --> 00:03:49,780 -use this tokenizer which is now ready to be -used. - -39 -00:03:49,780 --> 00:03:55,120 -Let's take an example: let's say you want -to train a GPT-2 model on Python code. - -40 -00:03:55,120 --> 00:04:03,000 -Even if the python code is in English this -type of text is very specific and deserves - -41 -00:04:03,000 --> 00:04:09,800 -a tokenizer trained on it - to convince you -of this we will see at the end the difference - -42 -00:04:09,800 --> 00:04:11,319 -produced on an example. - -43 -00:04:11,319 --> 00:04:18,889 -For that we are going to use the method "train_new_from_iterator" -that all the fast tokenizers of the library - -44 -00:04:18,889 --> 00:04:22,530 -have and thus in particular GPT2TokenizerFast. - -45 -00:04:22,530 --> 00:04:28,389 -This is the simplest method in our case to -have a tokenizer adapted to python code. - -46 -00:04:28,389 --> 00:04:34,229 -Remember, the first step is to gather a training -corpus. - -47 -00:04:34,229 --> 00:04:39,639 -We will use a subpart of the CodeSearchNet -dataset containing only python functions from - -48 -00:04:39,639 --> 00:04:42,039 -open source libraries on Github. - -49 -00:04:42,039 --> 00:04:48,890 -It's good timing, this dataset is known by -the datasets library and we can load it in - -50 -00:04:48,890 --> 00:04:51,190 -two lines of code. - -51 -00:04:51,190 --> 00:04:57,940 -Then, as the "train_new_from_iterator" method -expects a iterator of lists of texts we create - -52 -00:04:57,940 --> 00:05:04,030 -the "get_training_corpus" function which will -return an iterator. - -53 -00:05:04,030 --> 00:05:10,861 -Now that we have our iterator on our python -functions corpus, we can load the gpt-2 tokenizer - -54 -00:05:10,861 --> 00:05:12,490 -architecture. - -55 -00:05:12,490 --> 00:05:19,450 -Here "old_tokenizer" is not adapted to our -corpus but we only need one more line to train - -56 -00:05:19,450 --> 00:05:21,850 -it on our new corpus. - -57 -00:05:21,850 --> 00:05:29,310 -An argument that is common to most of the -tokenization algorithms used at the moment - -58 -00:05:29,310 --> 00:05:33,370 -is the size of the vocabulary, we choose here -the value 52 thousand. 
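In code, the whole procedure fits in a few lines. The dataset id, config and column name below are assumptions about how to grab the Python subset of CodeSearchNet, so adapt them to the corpus you actually use:

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("code_search_net", "python")

def get_training_corpus():
    # Yield the raw Python functions in batches of 1,000 texts
    train = raw_datasets["train"]
    for start_idx in range(0, len(train), 1000):
        yield train[start_idx : start_idx + 1000]["whole_func_string"]

old_tokenizer = AutoTokenizer.from_pretrained("gpt2")   # loads the fast GPT-2 tokenizer
new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), 52000)

# Save locally, or push to the Hub to reuse it later
new_tokenizer.save_pretrained("code-search-net-tokenizer")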
- -59 -00:05:33,370 --> 00:05:38,780 -Finally, once the training is finished, we -just have to save our new tokenizer locally - -60 -00:05:38,780 --> 00:05:45,430 -or send it to the hub to be able to reuse -it very easily afterwards. - -61 -00:05:45,430 --> 00:05:49,962 -Finally, let's see together on an example -if it was useful to re-train a tokenizer similar - -62 -00:05:49,962 --> 00:05:55,259 -to gpt2 one. - -63 -00:05:55,259 --> 00:06:01,610 -With the original tokenizer of GPT-2 we see -that all spaces are isolated and the method - -64 -00:06:01,610 --> 00:06:05,860 -name "randn" relatively common in python code -is split in 2. - -65 -00:06:05,860 --> 00:06:10,919 -With our new tokenizer, single and double -indentations have been learned and the method - -66 -00:06:10,919 --> 00:06:13,410 -"randn" is tokenized into 1 token. - -67 -00:06:13,410 --> 00:06:23,190 -And with that, you now know how to train your -very own tokenizers! +1 +00:00:00,000 --> 00:00:02,667 +(air whooshing) + +2 +00:00:05,310 --> 00:00:08,700 +- In this video we will see together + +3 +00:00:08,700 --> 00:00:11,820 +what is the purpose of +training a tokenizer, + +4 +00:00:11,820 --> 00:00:14,400 +what are the key steps to follow, + +5 +00:00:14,400 --> 00:00:16,953 +and what is the easiest way to do it. + +6 +00:00:18,690 --> 00:00:20,677 +You will ask yourself the question, + +7 +00:00:20,677 --> 00:00:23,040 +"Should I train a new tokenizer?", + +8 +00:00:23,040 --> 00:00:25,773 +when you plan to train a +new model from scratch. + +9 +00:00:29,520 --> 00:00:34,020 +A trained tokenizer would not +be suitable for your corpus + +10 +00:00:34,020 --> 00:00:37,080 +if your corpus is in a different language, + +11 +00:00:37,080 --> 00:00:42,060 +uses new characters, such as +accents or upper cased letters, + +12 +00:00:42,060 --> 00:00:47,060 +has a specific vocabulary, +for example medical or legal, + +13 +00:00:47,100 --> 00:00:49,050 +or uses a different style, + +14 +00:00:49,050 --> 00:00:51,873 +a language from another +century for example. + +15 +00:00:56,490 --> 00:00:58,320 +If I take the tokenizer trained on + +16 +00:00:58,320 --> 00:01:00,780 +the bert-base-uncased model, + +17 +00:01:00,780 --> 00:01:03,213 +and ignore its normalization step, + +18 +00:01:04,260 --> 00:01:07,650 +then we can see that the +tokenization operation + +19 +00:01:07,650 --> 00:01:09,277 +on the English sentence, + +20 +00:01:09,277 --> 00:01:12,480 +"Here is a sentence +adapted to our tokenizer", + +21 +00:01:12,480 --> 00:01:15,600 +produces a rather +satisfactory list of tokens, + +22 +00:01:15,600 --> 00:01:18,510 +in the sense that this +sentence of eight words + +23 +00:01:18,510 --> 00:01:20,643 +is tokenized into nine tokens. + +24 +00:01:22,920 --> 00:01:26,340 +On the other hand, if I +use this same tokenizer + +25 +00:01:26,340 --> 00:01:29,370 +on a sentence in Bengali, we see that + +26 +00:01:29,370 --> 00:01:33,690 +either a word is divided +into many sub tokens, + +27 +00:01:33,690 --> 00:01:36,270 +or that the tokenizer does not know one of + +28 +00:01:36,270 --> 00:01:39,873 +the unicode characters and +returns only an unknown token. + +29 +00:01:41,220 --> 00:01:44,970 +The fact that a common word +is split into many tokens + +30 +00:01:44,970 --> 00:01:47,910 +can be problematic, +because language models + +31 +00:01:47,910 --> 00:01:51,903 +can only handle a sequence +of tokens of limited length. 
+ +32 +00:01:52,830 --> 00:01:55,830 +A tokenizer that excessively +splits your initial text + +33 +00:01:55,830 --> 00:01:58,503 +may even impact the +performance of your model. + +34 +00:01:59,760 --> 00:02:02,280 +Unknown tokens are also problematic, + +35 +00:02:02,280 --> 00:02:04,530 +because the model will +not be able to extract + +36 +00:02:04,530 --> 00:02:07,563 +any information from the +unknown part of the text. + +37 +00:02:11,430 --> 00:02:13,440 +In this other example, we can see that + +38 +00:02:13,440 --> 00:02:17,100 +the tokenizer replaces +words containing characters + +39 +00:02:17,100 --> 00:02:20,973 +with accents and capital +letters with unknown tokens. + +40 +00:02:22,050 --> 00:02:24,770 +Finally, if we use again this tokenizer + +41 +00:02:24,770 --> 00:02:28,170 +to tokenize medical +vocabulary, we see again that + +42 +00:02:28,170 --> 00:02:31,800 +a single word is divided +into many sub tokens, + +43 +00:02:31,800 --> 00:02:34,803 +four for paracetamol, +and four for pharyngitis. + +44 +00:02:37,110 --> 00:02:39,360 +Most of the tokenizers used by the current + +45 +00:02:39,360 --> 00:02:42,540 +state of the art language +models need to be trained + +46 +00:02:42,540 --> 00:02:45,360 +on a corpus that is +similar to the one used + +47 +00:02:45,360 --> 00:02:47,463 +to pre-train the language model. + +48 +00:02:49,140 --> 00:02:51,150 +This training consists in learning rules + +49 +00:02:51,150 --> 00:02:53,250 +to divide the text into tokens. + +50 +00:02:53,250 --> 00:02:56,160 +And the way to learn +these rules and use them + +51 +00:02:56,160 --> 00:02:58,233 +depends on the chosen tokenizer model. + +52 +00:03:00,630 --> 00:03:04,590 +Thus, to train a new tokenizer, +it is first necessary + +53 +00:03:04,590 --> 00:03:07,653 +to build a training corpus +composed of raw texts. + +54 +00:03:08,910 --> 00:03:12,423 +Then, you have to choose an +architecture for your tokenizer. + +55 +00:03:13,410 --> 00:03:14,763 +Here there are two options. + +56 +00:03:15,900 --> 00:03:19,710 +The simplest is to reuse the +same architecture as the one + +57 +00:03:19,710 --> 00:03:22,863 +of a tokenizer used by +another model already trained. + +58 +00:03:24,210 --> 00:03:25,980 +Otherwise it is also possible + +59 +00:03:25,980 --> 00:03:28,560 +to completely design your tokenizer. + +60 +00:03:28,560 --> 00:03:31,683 +But it requires more +experience and attention. + +61 +00:03:33,750 --> 00:03:36,660 +Once the architecture +is chosen, you can thus + +62 +00:03:36,660 --> 00:03:39,513 +train this tokenizer on +your constituted corpus. + +63 +00:03:40,650 --> 00:03:43,440 +Finally, the last thing that +you need to do is to save + +64 +00:03:43,440 --> 00:03:46,443 +the learned rules to be +able to use this tokenizer. + +65 +00:03:49,530 --> 00:03:51,330 +Let's take an example. + +66 +00:03:51,330 --> 00:03:54,873 +Let's say you want to train +a GPT-2 model on Python code. + +67 +00:03:56,160 --> 00:03:59,640 +Even if the Python code +is usually in English + +68 +00:03:59,640 --> 00:04:02,386 +this type of text is very specific, + +69 +00:04:02,386 --> 00:04:04,473 +and deserves a tokenizer trained on it. + +70 +00:04:05,340 --> 00:04:07,980 +To convince you of this, +we will see at the end + +71 +00:04:07,980 --> 00:04:10,023 +the difference produced on an example. 
+ +72 +00:04:11,400 --> 00:04:13,747 +For that we are going to use the method + +73 +00:04:13,747 --> 00:04:18,240 +"train_new_from_iterator" +that all the fast tokenizers + +74 +00:04:18,240 --> 00:04:20,040 +of the library have and thus, + +75 +00:04:20,040 --> 00:04:22,503 +in particular GPT2TokenizerFast. + +76 +00:04:23,880 --> 00:04:26,100 +This is the simplest method in our case + +77 +00:04:26,100 --> 00:04:28,983 +to have a tokenizer +adapted to Python code. + +78 +00:04:30,180 --> 00:04:34,140 +Remember, the first thing is +to gather a training corpus. + +79 +00:04:34,140 --> 00:04:37,320 +We will use a subpart of +the CodeSearchNet dataset + +80 +00:04:37,320 --> 00:04:39,360 +containing only Python functions + +81 +00:04:39,360 --> 00:04:42,360 +from open source libraries on Github. + +82 +00:04:42,360 --> 00:04:43,650 +It's good timing. + +83 +00:04:43,650 --> 00:04:46,980 +This dataset is known +by the datasets library + +84 +00:04:46,980 --> 00:04:49,203 +and we can load it in two lines of code. + +85 +00:04:50,760 --> 00:04:55,230 +Then, as the "train_new_from_iterator" +method expects + +86 +00:04:55,230 --> 00:04:57,150 +a iterator of lists of texts, + +87 +00:04:57,150 --> 00:04:59,970 +we create the +"get_training_corpus" function, + +88 +00:04:59,970 --> 00:05:01,743 +which will return an iterator. + +89 +00:05:03,870 --> 00:05:05,430 +Now that we have our iterator + +90 +00:05:05,430 --> 00:05:09,630 +on our Python functions +corpus, we can load + +91 +00:05:09,630 --> 00:05:12,351 +the GPT-2 tokenizer architecture. + +92 +00:05:12,351 --> 00:05:16,560 +Here old_tokenizer is not +adapted to our corpus. + +93 +00:05:16,560 --> 00:05:17,700 +But we only need + +94 +00:05:17,700 --> 00:05:20,733 +one more line to train +it on our new corpus. + +95 +00:05:21,780 --> 00:05:24,720 +An argument that is common +to most of the tokenization + +96 +00:05:24,720 --> 00:05:28,980 +algorithms used at the moment +is the size of the vocabulary. + +97 +00:05:28,980 --> 00:05:31,773 +We choose here the value 52,000. + +98 +00:05:32,820 --> 00:05:35,760 +Finally, once the training is finished, + +99 +00:05:35,760 --> 00:05:38,850 +we just have to save our +new tokenizer locally, + +100 +00:05:38,850 --> 00:05:41,730 +or send it to the hub +to be able to reuse it + +101 +00:05:41,730 --> 00:05:43,593 +very easily afterwards. + +102 +00:05:45,270 --> 00:05:48,990 +Finally, let's see together +on an example if it was useful + +103 +00:05:48,990 --> 00:05:53,073 +to re-train a tokenizer +similar to GPT-2 one. + +104 +00:05:55,110 --> 00:05:57,660 +With the original tokenizer of GPT-2 + +105 +00:05:57,660 --> 00:06:00,330 +we see that all spaces are isolated, + +106 +00:06:00,330 --> 00:06:01,920 +and the method name randn, + +107 +00:06:01,920 --> 00:06:04,833 +relatively common in Python +code, is split in two. + +108 +00:06:05,730 --> 00:06:09,060 +With our new tokenizer, +single and double indentations + +109 +00:06:09,060 --> 00:06:10,890 +have been learned and the method randn + +110 +00:06:10,890 --> 00:06:13,770 +is tokenized into one token. + +111 +00:06:13,770 --> 00:06:15,000 +And with that, + +112 +00:06:15,000 --> 00:06:18,123 +you now know how to train +your very own tokenizers now. 
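For reference, the whole workflow narrated in this video fits in a few lines of Python. This is a minimal sketch: the dataset identifier, the "whole_func_string" column name and the batching scheme are assumptions drawn from the narration, not an exact copy of the course notebook.

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# Assumed identifiers: the CodeSearchNet Python subset and its
# "whole_func_string" column holding raw Python functions.
raw_datasets = load_dataset("code_search_net", "python")

def get_training_corpus():
    # Yield batches of texts so we never hold the whole corpus in memory.
    dataset = raw_datasets["train"]
    for start_idx in range(0, len(dataset), 1000):
        yield dataset[start_idx : start_idx + 1000]["whole_func_string"]

# Reuse the GPT-2 tokenizer architecture (GPT2TokenizerFast under the hood)
# and train it on our corpus with a 52,000-token vocabulary.
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")
new_tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(), vocab_size=52000
)

# Save the learned rules locally, or push them to the Hub to reuse later.
new_tokenizer.save_pretrained("code-search-net-tokenizer")
# new_tokenizer.push_to_hub("code-search-net-tokenizer")
```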
+
+113
+00:06:19,498 --> 00:06:22,165
+(air whooshing)
+
diff --git a/subtitles/en/43_why-are-fast-tokenizers-called-fast.srt b/subtitles/en/43_why-are-fast-tokenizers-called-fast.srt
index 18577f493..0253c1ace 100644
--- a/subtitles/en/43_why-are-fast-tokenizers-called-fast.srt
+++ b/subtitles/en/43_why-are-fast-tokenizers-called-fast.srt
@@ -1,89 +1,168 @@
-1
-00:00:05,200 --> 00:00:11,600
-Why are fast tokenizers called fast? In this video  
-we will see exactly how much faster the so-called   

-2
-00:00:11,600 --> 00:00:16,960
-fast tokenizers are compared to their  
-slow counterparts. For this benchmark,   

-3
-00:00:16,960 --> 00:00:22,160
-we will use the GLUE MNLI dataset, which  
-contains 432 thousands pairs of texts.   

-4
-00:00:22,880 --> 00:00:27,040
-We will see how long it takes for the  
-fast and slow versions of a BERT tokenizer   

-5
-00:00:27,040 --> 00:00:34,080
-to process them all. We define our fast and slow  
-tokenizer using the AutoTokenizer API. The fast   

-6
-00:00:34,080 --> 00:00:40,160
-tokenizer is the default (when available), so we  
-pass along use_fast=False to define the slow one.   

-7
-00:00:41,200 --> 00:00:45,760
-In a notebook, we can time the execution of a  
-cell with the time magic command, like this.   

-8
-00:00:46,560 --> 00:00:50,720
-Processing the whole dataset is four  
-times faster with a fast tokenizer.   

-9
-00:00:50,720 --> 00:00:54,960
-That's quicker indeed, but not very impressive  
-however. That's because we passed along the   

-10
-00:00:54,960 --> 00:00:59,520
-texts to the tokenizer one at a time. This is  
-a common mistake to do with fast tokenizers,   

-11
-00:00:59,520 --> 00:01:04,320
-which are backed by Rust and thus able to  
-parallelize the tokenization of multiple texts.   

-12
-00:01:05,120 --> 00:01:09,520
-Passing them only one text at a time is like  
-sending a cargo ship between two continents   

-13
-00:01:09,520 --> 00:01:15,600
-with just one container, it's very inefficient.  
-To unleash the full speed of our fast tokenizers,   

-14
-00:01:15,600 --> 00:01:20,320
-we need to send them batches of texts, which  
-we can do with the batched=True argument   

-15
-00:01:20,320 --> 00:01:26,720
-of the map method. Now those results are  
-impressive! The fast tokenizer takes 12 seconds to   

-16
-00:01:26,720 --> 00:01:33,280
-process a dataset that takes 4 minutes to the slow  
-tokenizer. Summarizing the results in this table,   

-17
-00:01:33,280 --> 00:01:37,200
-you can see why we have called those  
-tokenizers fast. And this is only for   

-18
-00:01:37,200 --> 00:01:48,160
-tokenizing texts. If you ever need to train a  
-new tokenizer, they do this very quickly too!
+1
+00:00:00,418 --> 00:00:03,251
+(dramatic whoosh)
+
+2
+00:00:05,340 --> 00:00:08,460
+- Why are fast tokenizers called fast?
+
+3
+00:00:08,460 --> 00:00:10,950
+In this video, we'll see
+exactly how much faster
+
+4
+00:00:10,950 --> 00:00:13,800
+the so-called fast
+tokenizers are compared
+
+5
+00:00:13,800 --> 00:00:15,153
+to their slow counterparts.
+
+6
+00:00:16,200 --> 00:00:19,260
+For this benchmark, we'll
+use the GLUE MNLI dataset
+
+7
+00:00:19,260 --> 00:00:23,160
+which contains 432,000 pairs of text.
+
+8
+00:00:23,160 --> 00:00:25,890
+We'll see how long it takes
+for the fast and slow versions
+
+9
+00:00:25,890 --> 00:00:28,143
+of a BERT tokenizer to process them all.
+
+10
+00:00:29,670 --> 00:00:31,380
+We define our fast and
+slow tokenizers
+
+11
+00:00:31,380 --> 00:00:33,717
+using the AutoTokenizer API.
+
+12
+00:00:33,717 --> 00:00:37,110
+The fast tokenizer is the
+default when available.
+
+13
+00:00:37,110 --> 00:00:40,443
+So we pass along use_fast=False
+to define the slow one.
+
+14
+00:00:41,430 --> 00:00:43,530
+In a notebook, we can time the execution
+
+15
+00:00:43,530 --> 00:00:46,800
+of a cell with the time
+magic command, like this.
+
+16
+00:00:46,800 --> 00:00:49,350
+Processing the whole dataset
+is four times faster
+
+17
+00:00:49,350 --> 00:00:50,970
+with a fast tokenizer.
+
+18
+00:00:50,970 --> 00:00:54,000
+That's quicker indeed,
+but not very impressive.
+
+19
+00:00:54,000 --> 00:00:55,380
+This is because we passed along the texts
+
+20
+00:00:55,380 --> 00:00:57,240
+to the tokenizer one at a time.
+
+21
+00:00:57,240 --> 00:00:59,730
+This is a common mistake
+to do with fast tokenizers
+
+22
+00:00:59,730 --> 00:01:02,550
+which are backed by Rust,
+and thus able to parallelize
+
+23
+00:01:02,550 --> 00:01:05,370
+the tokenization of multiple texts.
+
+24
+00:01:05,370 --> 00:01:07,290
+Passing them only one text at a time,
+
+25
+00:01:07,290 --> 00:01:09,720
+is like sending a cargo
+ship between two continents
+
+26
+00:01:09,720 --> 00:01:13,140
+with just one container,
+it's very inefficient.
+
+27
+00:01:13,140 --> 00:01:15,810
+To unleash the full speed
+of our fast tokenizers,
+
+28
+00:01:15,810 --> 00:01:18,840
+we need to send them batches
+of texts, which we can do
+
+29
+00:01:18,840 --> 00:01:21,423
+with the batched=True
+argument of the map method.
+
+30
+00:01:22,620 --> 00:01:25,950
+Now those are impressive
+results: the fast tokenizer
+
+31
+00:01:25,950 --> 00:01:28,410
+takes 12 seconds to process
+the dataset that takes four
+
+32
+00:01:28,410 --> 00:01:30,093
+minutes to the slow tokenizer.
+
+33
+00:01:31,440 --> 00:01:33,510
+Summarizing the results in this table,
+
+34
+00:01:33,510 --> 00:01:36,630
+you can see why we have
+called those tokenizers fast.
+
+35
+00:01:36,630 --> 00:01:38,760
+And this is only for tokenizing texts.
+
+36
+00:01:38,760 --> 00:01:40,710
+If you ever need to train a new tokenizer,
+
+37
+00:01:40,710 --> 00:01:42,523
+they do this very quickly too.
+
diff --git a/subtitles/en/44_fast-tokenizer-superpowers.srt b/subtitles/en/44_fast-tokenizer-superpowers.srt
index de81e3a3e..a5453f675 100644
--- a/subtitles/en/44_fast-tokenizer-superpowers.srt
+++ b/subtitles/en/44_fast-tokenizer-superpowers.srt
@@ -1,200 --> +1,335 @@
-1
-00:00:05,109 --> 00:00:10,089
-The fast tokenizers of the Transformers library
-are fast, but they also implement features
-
-2
-00:00:10,089 --> 00:00:14,610
-that will be super useful for data pre-processing
-and post-processing.
-
-3
-00:00:14,610 --> 00:00:16,750
-Let's have a look at them!
-
-4
-00:00:16,750 --> 00:00:21,759
-First, let's have a look at the usual output
-of a tokenizer.
-
-5
-00:00:21,759 --> 00:00:28,039
-We get input IDs that correspond to tokens,
-but we lose a lot of information in the process.
-
-6
-00:00:28,039 --> 00:00:33,270
-For instance, here the tokenization is the
-same for the two sentences, even if one has
-
-7
-00:00:33,270 --> 00:00:36,510
-several more spaces than the other.
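The fast-versus-slow benchmark described in the video above boils down to building both tokenizers and mapping them over the dataset, first one example at a time and then in batches. This is only a sketch: the bert-base-cased checkpoint and the premise/hypothesis column names are assumptions, and in a notebook each map call would be prefixed with the %time magic to measure it.

```python
from datasets import load_dataset
from transformers import AutoTokenizer

# GLUE MNLI: roughly 432,000 premise/hypothesis pairs.
raw_datasets = load_dataset("glue", "mnli")

# The fast, Rust-backed tokenizer is the default when available;
# use_fast=False forces the slow, pure-Python implementation.
fast_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)

def tokenize(batch, tokenizer):
    return tokenizer(batch["premise"], batch["hypothesis"], truncation=True)

# One text pair at a time: the fast tokenizer is only ~4x quicker.
raw_datasets["train"].map(lambda example: tokenize(example, fast_tokenizer))
raw_datasets["train"].map(lambda example: tokenize(example, slow_tokenizer))

# Batched: the Rust backend parallelizes over many texts at once,
# which is where the dramatic speed-up comes from.
raw_datasets["train"].map(lambda batch: tokenize(batch, fast_tokenizer), batched=True)
raw_datasets["train"].map(lambda batch: tokenize(batch, slow_tokenizer), batched=True)
```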
- -8 -00:00:36,510 --> 00:00:41,090 -Just having the input IDs is thus not enough -if we want to match some tokens with a span - -9 -00:00:41,090 --> 00:00:46,610 -of text (something we will need to do when -tackling question answering for instance). - -10 -00:00:46,610 --> 00:00:51,699 -It's also difficult to know when two tokens -belong to the same word or not: it looks easy - -11 -00:00:51,699 --> 00:00:57,250 -when you just look at the output of a BERT -tokenizer, we just need to look for the ##. But - -12 -00:00:57,250 --> 00:01:01,490 -other tokenizers have different ways to tokenize -parts of words. - -13 -00:01:01,490 --> 00:01:06,910 -For instance RoBERTa adds this special G symbol -to mark the tokens at the beginning of a word, - -14 -00:01:06,910 --> 00:01:12,160 -and T5 uses this special underscore symbol -for the same purpose. - -15 -00:01:12,160 --> 00:01:16,759 -Thankfully, the fast tokenizers keep track -of the word each token comes from, with a - -16 -00:01:16,759 --> 00:01:20,090 -word_ids method you can use on their outputs. - -17 -00:01:20,090 --> 00:01:24,799 -The output is not necessarily clear, but assembled -together in a nice table like this, we can - -18 -00:01:24,799 --> 00:01:28,119 -look at the word position for each token. - -19 -00:01:28,119 --> 00:01:32,659 -Even better, the fast tokenizers keep track -of the span of characters each token comes - -20 -00:01:32,659 --> 00:01:38,780 -from, and we can get them when calling it -on one (or several) text by adding the return_offsets_mapping=True - -21 -00:01:38,780 --> 00:01:39,780 -argument. - -22 -00:01:39,780 --> 00:01:46,469 -In this instance, we can see how we jump positions -between the ##s token and the super token, - -23 -00:01:46,469 --> 00:01:50,579 -because of the multiple spaces in the initial -sentence. - -24 -00:01:50,579 --> 00:01:54,470 -To enable this, the fast tokenizers store -additional information at each step of their - -25 -00:01:54,470 --> 00:01:55,470 -internal pipeline. - -26 -00:01:55,470 --> 00:02:00,899 -That internal pipeline consists of normalization, -where we apply some cleaning to the text, - -27 -00:02:00,899 --> 00:02:05,600 -like lowercasing or removing the accents;() -pre-tokenization, which is where we split - -28 -00:02:05,600 --> 00:02:09,940 -the texts into words;() then we apply the -model of the tokenizer, which is where the - -29 -00:02:09,940 --> 00:02:15,300 -words are splits into tokens,() before finally -doing the post-processing, where special tokens - -30 -00:02:15,300 --> 00:02:17,110 -are added. - -31 -00:02:17,110 --> 00:02:20,730 -From the beginning to the end of the pipeline, -the tokenizer keeps track of each span of - -32 -00:02:20,730 --> 00:02:23,680 -text that corresponds to each word, then each -token. - -33 -00:02:23,680 --> 00:02:29,099 -We will see how useful it is when we tackle -the following tasks: when doing masked language - -34 -00:02:29,099 --> 00:02:34,360 -modeling, one variation that gets state-of-the-art -results is to mask all the tokens of a given - -35 -00:02:34,360 --> 00:02:37,600 -word instead of randomly chosen tokens. - -36 -00:02:37,600 --> 00:02:40,909 -This will require us to use the word IDs we -saw. - -37 -00:02:40,909 --> 00:02:45,209 -When doing token classification, we'll need -to convert the labels we have on words, to - -38 -00:02:45,209 --> 00:02:47,230 -labels on each tokens. 
- -39 -00:02:47,230 --> 00:02:51,360 -As for the offset mappings, it will be super -useful when we need to convert token positions - -40 -00:02:51,360 --> 00:02:56,330 -in a sentence into a span of text, which we -will need to know when looking at question - -41 -00:02:56,330 --> 00:03:01,200 -answering or when grouping the tokens corresponding -to the same entity in token classification. - -42 -00:03:01,200 --> 00:03:09,730 -To have a look at these tasks, check the videos -linked below! +1 +00:00:05,010 --> 00:00:06,270 +- The fast tokenizers + +2 +00:00:06,270 --> 00:00:08,580 +of the Transformers library are fast, + +3 +00:00:08,580 --> 00:00:11,490 +but they also implement features +that will be super useful + +4 +00:00:11,490 --> 00:00:14,536 +for data pre-processing +and post-processing. + +5 +00:00:14,536 --> 00:00:17,239 +Let's have a look at them! + +6 +00:00:17,239 --> 00:00:18,650 +First, let's have a look + +7 +00:00:18,650 --> 00:00:21,690 +at the usual output of a tokenizer. + +8 +00:00:21,690 --> 00:00:24,278 +We get input IDs that correspond to token, + +9 +00:00:24,278 --> 00:00:27,960 +but we lose a lot of +information in the process. + +10 +00:00:27,960 --> 00:00:29,010 +For instance, + +11 +00:00:29,010 --> 00:00:31,856 +here the tokenization is the +same for the two sentences + +12 +00:00:31,856 --> 00:00:35,373 +even if one has several +more spaces than the other. + +13 +00:00:36,300 --> 00:00:39,150 +Just having the input +IDs is thus not enough + +14 +00:00:39,150 --> 00:00:42,330 +if we want to match some +tokens with a span of text, + +15 +00:00:42,330 --> 00:00:43,320 +something we'll need to do + +16 +00:00:43,320 --> 00:00:46,111 +when tackling question +answering, for instance. + +17 +00:00:46,111 --> 00:00:47,592 +It's also difficult to know + +18 +00:00:47,592 --> 00:00:50,850 +when two tokens belong +to the same word or not. + +19 +00:00:50,850 --> 00:00:52,860 +It looks easy when you +just look at the output + +20 +00:00:52,860 --> 00:00:55,650 +of a BERT tokenizer where +we just need to look + +21 +00:00:55,650 --> 00:00:56,779 +for the hash hash. + +22 +00:00:56,779 --> 00:00:59,040 +But other tokenizers have different ways + +23 +00:00:59,040 --> 00:01:00,987 +to tokenize parts of words. + +24 +00:01:00,987 --> 00:01:04,470 +For instance, RoBERTa +adds this special G symbol + +25 +00:01:04,470 --> 00:01:06,491 +to mark the tokens at +the beginning of the word + +26 +00:01:06,491 --> 00:01:09,570 +and T5 uses this special underscore symbol + +27 +00:01:09,570 --> 00:01:11,150 +for the same purpose. + +28 +00:01:11,150 --> 00:01:14,760 +Thankfully, the fast tokenizers +keep track of the word + +29 +00:01:14,760 --> 00:01:16,230 +each token comes from, + +30 +00:01:16,230 --> 00:01:19,571 +with a word_ids method you +can use on their outputs. + +31 +00:01:19,571 --> 00:01:21,870 +The output is not necessarily clear, + +32 +00:01:21,870 --> 00:01:24,076 +but assembled together in +a nice table like this, + +33 +00:01:24,076 --> 00:01:26,853 +we can look at the word +position for each token. + +34 +00:01:27,930 --> 00:01:30,220 +Even better, the fast +tokenizers keep track + +35 +00:01:30,220 --> 00:01:33,198 +of the span of characters +each token comes from, + +36 +00:01:33,198 --> 00:01:35,760 +and we can get them when calling it on one + +37 +00:01:35,760 --> 00:01:37,221 +or several text by adding + +38 +00:01:37,221 --> 00:01:40,470 +the return_offsets_mapping=True argument. 
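Both pieces of information mentioned here are exposed directly on a fast tokenizer's output; the checkpoint below is just an illustrative choice.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
encoding = tokenizer(
    "Let's talk about tokenizers   superpowers.",
    return_offsets_mapping=True,
)

print(encoding.tokens())           # the produced tokens, special tokens included
print(encoding.word_ids())         # index of the word each token comes from (None for [CLS]/[SEP])
print(encoding["offset_mapping"])  # (start, end) character span of each token in the original text
```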
+ +39 +00:01:40,470 --> 00:01:42,312 +In this instance, we can +see how we jump positions + +40 +00:01:42,312 --> 00:01:45,650 +between the hash hash +token and the super token, + +41 +00:01:45,650 --> 00:01:49,992 +because of the multiple spaces +in the initial sentence. + +42 +00:01:49,992 --> 00:01:52,110 +To enable this, the fast tokenizers + +43 +00:01:52,110 --> 00:01:54,270 +store additional information at each step + +44 +00:01:54,270 --> 00:01:55,440 +of their internal pipeline. + +45 +00:01:55,440 --> 00:01:57,951 +That internal pipeline +consists of normalization, + +46 +00:01:57,951 --> 00:02:00,360 +where we apply some cleaning to the text, + +47 +00:02:00,360 --> 00:02:02,621 +like lower casing or removing the accents; + +48 +00:02:02,621 --> 00:02:04,088 +pre-tokenization, + +49 +00:02:04,088 --> 00:02:06,530 +which is where we split +the texts into words; + +50 +00:02:06,530 --> 00:02:09,360 +then we apply the model of the tokenizer, + +51 +00:02:09,360 --> 00:02:11,725 +which is where the words +are split into tokens, + +52 +00:02:11,725 --> 00:02:13,748 +before finally doing the post processing, + +53 +00:02:13,748 --> 00:02:16,023 +where special tokens are added. + +54 +00:02:17,100 --> 00:02:19,050 +From the beginning to +the end of the pipeline, + +55 +00:02:19,050 --> 00:02:21,390 +the tokenizer keeps track +of each span of text + +56 +00:02:21,390 --> 00:02:23,853 +that corresponds to each +word, then each token. + +57 +00:02:24,990 --> 00:02:26,100 +We'll see how useful it is + +58 +00:02:26,100 --> 00:02:27,990 +when we tackle the following tasks: + +59 +00:02:27,990 --> 00:02:29,549 +when doing masked language modeling + +60 +00:02:29,549 --> 00:02:32,407 +one variation that gets +state-of-the-art results + +61 +00:02:32,407 --> 00:02:35,040 +is to mask all the tokens of a given word + +62 +00:02:35,040 --> 00:02:37,440 +instead of randomly chosen words. + +63 +00:02:37,440 --> 00:02:40,800 +This will require us to +use the word IDs we saw. + +64 +00:02:40,800 --> 00:02:42,329 +When doing token classification, + +65 +00:02:42,329 --> 00:02:45,090 +we'll need to convert the +labels we have on words, + +66 +00:02:45,090 --> 00:02:47,250 +to labels on each tokens. + +67 +00:02:47,250 --> 00:02:48,480 +As for the offset mappings, + +68 +00:02:48,480 --> 00:02:50,610 +it will be super useful +when we need to convert + +69 +00:02:50,610 --> 00:02:53,436 +token positions in a +sentence into a span of text, + +70 +00:02:53,436 --> 00:02:55,800 +which we'll need to +know when we're looking + +71 +00:02:55,800 --> 00:02:56,813 +at question answering + +72 +00:02:56,813 --> 00:02:58,680 +or when grouping the tokens corresponding + +73 +00:02:58,680 --> 00:03:01,023 +to the same entity in +token classification. + +74 +00:03:02,160 --> 00:03:03,450 +To have a look at these tasks, + +75 +00:03:03,450 --> 00:03:04,950 +check the videos linked below! + diff --git a/subtitles/en/45_inside-the-token-classification-pipeline-(pytorch).srt b/subtitles/en/45_inside-the-token-classification-pipeline-(pytorch).srt index f303bdfe9..b248a28c8 100644 --- a/subtitles/en/45_inside-the-token-classification-pipeline-(pytorch).srt +++ b/subtitles/en/45_inside-the-token-classification-pipeline-(pytorch).srt @@ -1,154 +1,333 @@ -1 -00:00:05,200 --> 00:00:08,080 -Let's have a look inside the  -token classification pipeline.   
- -2 -00:00:10,000 --> 00:00:13,920 -In the pipeline video, we looked at the  -different applications the Transformers   - -3 -00:00:13,920 --> 00:00:19,840 -library supports out of the box, one of them being  -token classification, for instance predicting for   - -4 -00:00:19,840 --> 00:00:24,960 -each word in a sentence whether they correspond  -to a person, an organization or a location.   - -5 -00:00:26,400 --> 00:00:30,240 -We can even group together the tokens  -corresponding to the same entity,   - -6 -00:00:30,240 --> 00:00:34,960 -for instance all the tokens that formed  -the word Sylvain here, or Hugging and Face.   - -7 -00:00:36,960 --> 00:00:42,480 -The token classification pipeline works the same  -way as the text classification pipeline we studied   - -8 -00:00:42,480 --> 00:00:49,360 -in a previous video. There are three steps: the  -tokenization, the model, and the post processing.   - -9 -00:00:50,720 --> 00:00:55,680 -The first two steps are identical to the text  -classification pipeline, except we use an auto   - -10 -00:00:55,680 --> 00:01:01,760 -token classification model instead of a sequence  -classification one. We tokenize our text then feed   - -11 -00:01:01,760 --> 00:01:07,360 -it to the model. Instead of getting one number  -for each possible label for the whole sentence,   - -12 -00:01:07,360 --> 00:01:13,760 -we get one number for each of the possible 9  -labels for every token in the sentence, here 19.   - -13 -00:01:15,120 --> 00:01:19,600 -Like all the other models of the Transformers  -library, our model outputs logits,   - -14 -00:01:19,600 --> 00:01:26,160 -which we turn into predictions by using a SoftMax.  -We also get the predicted label for each token by   - -15 -00:01:26,160 --> 00:01:30,000 -taking the maximum prediction (since the softmax  -function preserves the order, we could have   - -16 -00:01:30,000 --> 00:01:35,200 -done it on the logits if we had no need of the  -predictions). The model config contains the label   - -17 -00:01:35,200 --> 00:01:41,200 -mapping in its id2label field. Using it, we can  -map every token to its corresponding label. The   - -18 -00:01:41,200 --> 00:01:46,400 -label O correspond to "no entity", which is why we  -didn't see it in our results in the first slide.   - -19 -00:01:47,040 --> 00:01:51,360 -On top of the label and the probability,  -those results included the start and end   - -20 -00:01:51,360 --> 00:01:56,960 -character in the sentence. We will need to use the  -offset mapping of the tokenizer to get those (look   - -21 -00:01:56,960 --> 00:02:02,080 -at the video linked below if you don't know about  -them already). Then, looping through each token   - -22 -00:02:02,080 --> 00:02:08,240 -that has a label distinct from O, we can build the  -list of results we got with our first pipeline.   - -23 -00:02:08,240 --> 00:02:13,360 -The last step is to group together tokens  -that correspond to the same entity.This   - -24 -00:02:13,360 --> 00:02:17,680 -is why we had two labels for each type  -of entity: I-PER and B-PER for instance.   
- -25 -00:02:18,240 --> 00:02:21,840 -It allows us to know if a token is in  -the same entity as the previous one.()   - -26 -00:02:23,120 --> 00:02:26,720 -Note that there are two ways of  -labelling used for token classification,   - -27 -00:02:26,720 --> 00:02:31,680 -one (in pink here) uses the B-PER label at the  -beginning of each new entity, but the other   - -28 -00:02:31,680 --> 00:02:38,320 -(in blue) only uses it to separate two adjacent  -entities of the same type. In both cases, we can   - -29 -00:02:38,320 --> 00:02:44,720 -flag a new entity each time we see a new label  -appearing (either with the I or B prefix) then   - -30 -00:02:44,720 --> 00:02:50,160 -take all the following tokens labelled the same,  -with an I-flag. This, coupled with the offset   - -31 -00:02:50,160 --> 00:03:01,040 -mapping to get the start and end characters allows  -us to get the span of texts for each entity. +1 +00:00:00,076 --> 00:00:01,462 +(title whooshes) + +2 +00:00:01,462 --> 00:00:02,382 +(logo pops) + +3 +00:00:02,382 --> 00:00:05,340 +(title whooshes) + +4 +00:00:05,340 --> 00:00:06,210 +- Let's have a look + +5 +00:00:06,210 --> 00:00:08,283 +inside the token classification pipeline. + +6 +00:00:10,080 --> 00:00:11,580 +In the pipeline video, + +7 +00:00:11,580 --> 00:00:13,320 +we looked at the different applications + +8 +00:00:13,320 --> 00:00:15,960 +the Transformers library +supports out of the box, + +9 +00:00:15,960 --> 00:00:18,780 +one of them being token classification, + +10 +00:00:18,780 --> 00:00:21,810 +for instance predicting +for each word in a sentence + +11 +00:00:21,810 --> 00:00:24,510 +whether they correspond to +a person, an organization + +12 +00:00:24,510 --> 00:00:25,353 +or a location. + +13 +00:00:26,670 --> 00:00:28,920 +We can even group together +the tokens corresponding + +14 +00:00:28,920 --> 00:00:32,040 +to the same entity, for +instance all the tokens + +15 +00:00:32,040 --> 00:00:35,373 +that formed the word Sylvain +here, or Hugging and Face. + +16 +00:00:37,290 --> 00:00:40,230 +The token classification +pipeline works the same way + +17 +00:00:40,230 --> 00:00:42,630 +as the text classification +pipeline we studied + +18 +00:00:42,630 --> 00:00:44,430 +in the previous video. + +19 +00:00:44,430 --> 00:00:45,930 +There are three steps. + +20 +00:00:45,930 --> 00:00:49,623 +The tokenization, the model, +and the postprocessing. + +21 +00:00:50,940 --> 00:00:52,530 +The first two steps are identical + +22 +00:00:52,530 --> 00:00:54,630 +to the text classification pipeline, + +23 +00:00:54,630 --> 00:00:57,300 +except we use an auto +token classification model + +24 +00:00:57,300 --> 00:01:00,150 +instead of a sequence classification one. + +25 +00:01:00,150 --> 00:01:03,720 +We tokenize our text then +feed it to the model. + +26 +00:01:03,720 --> 00:01:05,877 +Instead of getting one number +for each possible label + +27 +00:01:05,877 --> 00:01:08,700 +for the whole sentence, we get one number + +28 +00:01:08,700 --> 00:01:10,770 +for each of the possible nine labels + +29 +00:01:10,770 --> 00:01:13,983 +for every token in the sentence, here 19. + +30 +00:01:15,300 --> 00:01:18,090 +Like all the other models +of the Transformers library, + +31 +00:01:18,090 --> 00:01:19,830 +our model outputs logits, + +32 +00:01:19,830 --> 00:01:23,073 +which we turn into predictions +by using a SoftMax. 
+ +33 +00:01:23,940 --> 00:01:26,190 +We also get the predicted +label for each token + +34 +00:01:26,190 --> 00:01:27,990 +by taking the maximum prediction, + +35 +00:01:27,990 --> 00:01:29,880 +since the SoftMax function +preserves the orders, + +36 +00:01:29,880 --> 00:01:31,200 +we could have done it on the logits + +37 +00:01:31,200 --> 00:01:33,050 +if we had no need of the predictions. + +38 +00:01:33,930 --> 00:01:35,880 +The model config contains +the label mapping + +39 +00:01:35,880 --> 00:01:37,740 +in its id2label field. + +40 +00:01:37,740 --> 00:01:41,430 +Using it, we can map every token +to its corresponding label. + +41 +00:01:41,430 --> 00:01:43,950 +The label, O, correspond to no entity, + +42 +00:01:43,950 --> 00:01:45,985 +which is why we didn't +see it in our results + +43 +00:01:45,985 --> 00:01:47,547 +in the first slide. + +44 +00:01:47,547 --> 00:01:49,440 +On top of the label and the probability, + +45 +00:01:49,440 --> 00:01:51,000 +those results included the start + +46 +00:01:51,000 --> 00:01:53,103 +and end character in the sentence. + +47 +00:01:54,120 --> 00:01:55,380 +We'll need to use the offset mapping + +48 +00:01:55,380 --> 00:01:56,640 +of the tokenizer to get those. + +49 +00:01:56,640 --> 00:01:58,050 +Look at the video linked below + +50 +00:01:58,050 --> 00:02:00,300 +if you don't know about them already. + +51 +00:02:00,300 --> 00:02:02,280 +Then, looping through each token + +52 +00:02:02,280 --> 00:02:04,080 +that has a label distinct from O, + +53 +00:02:04,080 --> 00:02:06,120 +we can build the list of results we got + +54 +00:02:06,120 --> 00:02:07,320 +with our first pipeline. + +55 +00:02:08,460 --> 00:02:10,560 +The last step is to group together tokens + +56 +00:02:10,560 --> 00:02:12,310 +that correspond to the same entity. + +57 +00:02:13,264 --> 00:02:16,140 +This is why we had two labels +for each type of entity, + +58 +00:02:16,140 --> 00:02:18,450 +I-PER and B-PER, for instance. + +59 +00:02:18,450 --> 00:02:20,100 +It allows us to know if a token is + +60 +00:02:20,100 --> 00:02:22,323 +in the same entity as the previous one. + +61 +00:02:23,310 --> 00:02:25,350 +Note, that there are two +ways of labeling used + +62 +00:02:25,350 --> 00:02:26,850 +for token classification. + +63 +00:02:26,850 --> 00:02:29,420 +One, in pink here, uses the B-PER label + +64 +00:02:29,420 --> 00:02:30,810 +at the beginning of each new entity, + +65 +00:02:30,810 --> 00:02:32,760 +but the other, in blue, + +66 +00:02:32,760 --> 00:02:35,340 +only uses it to separate +two adjacent entities + +67 +00:02:35,340 --> 00:02:37,140 +of the same type. + +68 +00:02:37,140 --> 00:02:39,690 +In both cases, we can flag a new entity + +69 +00:02:39,690 --> 00:02:41,940 +each time we see a new label appearing, + +70 +00:02:41,940 --> 00:02:44,730 +either with the I or B prefix, + +71 +00:02:44,730 --> 00:02:47,130 +then take all the following +tokens labeled the same, + +72 +00:02:47,130 --> 00:02:48,870 +with an I-flag. + +73 +00:02:48,870 --> 00:02:51,330 +This, coupled with the offset +mapping to get the start + +74 +00:02:51,330 --> 00:02:54,210 +and end characters allows +us to get the span of texts + +75 +00:02:54,210 --> 00:02:55,233 +for each entity. 
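Chained together, the steps of this video look roughly like the sketch below. The checkpoint name is just one example of a token classification model, and the grouping of adjacent entities is left out to keep the sketch short.

```python
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# An example NER checkpoint; any token classification model works the same way.
checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

sentence = "My name is Sylvain and I work at Hugging Face in Brooklyn."
inputs = tokenizer(sentence, return_tensors="pt", return_offsets_mapping=True)
offsets = inputs.pop("offset_mapping")[0]

with torch.no_grad():
    logits = model(**inputs).logits[0]        # one row of logits per token

probabilities = torch.softmax(logits, dim=-1)
predictions = probabilities.argmax(dim=-1)

results = []
for idx, pred in enumerate(predictions):
    label = model.config.id2label[pred.item()]
    if label != "O":                          # "O" means "no entity"
        start, end = offsets[idx].tolist()
        results.append(
            {"entity": label, "score": probabilities[idx, pred].item(),
             "word": sentence[start:end], "start": start, "end": end}
        )
print(results)
```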
+ +76 +00:02:56,569 --> 00:02:59,532 +(title whooshes) + +77 +00:02:59,532 --> 00:03:01,134 +(title fizzles) + diff --git a/subtitles/en/46_inside-the-token-classification-pipeline-(tensorflow).srt b/subtitles/en/46_inside-the-token-classification-pipeline-(tensorflow).srt index 1f7367a85..f628d7744 100644 --- a/subtitles/en/46_inside-the-token-classification-pipeline-(tensorflow).srt +++ b/subtitles/en/46_inside-the-token-classification-pipeline-(tensorflow).srt @@ -1,189 +1,320 @@ -1 -00:00:05,250 --> 00:00:09,840 -Let's have a look inside the token classification -pipeline. - -2 -00:00:09,840 --> 00:00:14,690 -In the pipeline video, we looked at the different -applications the Transformers library supports - -3 -00:00:14,690 --> 00:00:20,820 -out of the box, one of them being token classification, -for instance predicting for each word in a - -4 -00:00:20,820 --> 00:00:27,760 -sentence whether they correspond to a person, -an organization or a location. - -5 -00:00:27,760 --> 00:00:32,660 -We can even group together the tokens corresponding -to the same entity, for instance all the tokens - -6 -00:00:32,660 --> 00:00:37,950 -that formed the word Sylvain here, or Hugging -and Face. - -7 -00:00:37,950 --> 00:00:42,480 -The token classification pipeline works the -same way as the text classification pipeline - -8 -00:00:42,480 --> 00:00:44,379 -we studied in a previous video. - -9 -00:00:44,379 --> 00:00:49,600 -There are three steps: the tokenization, the -model, and the post processing. - -10 -00:00:49,600 --> 00:00:56,340 -The first two steps are identical to the text -classification pipeline, except we use an - -11 -00:00:56,340 --> 00:01:01,640 -auto token classification model instead of -a sequence classification one. - -12 -00:01:01,640 --> 00:01:05,840 -We tokenize our text then feed it to the model. - -13 -00:01:05,840 --> 00:01:10,400 -Instead of getting one number for each possible -label for the whole sentence, we get one number - -14 -00:01:10,400 --> 00:01:16,690 -for each of the possible 9 labels for every -token in the sentence, here 19. - -15 -00:01:16,690 --> 00:01:22,299 -Like all the other models of the Transformers -library, our model outputs logits, which we - -16 -00:01:22,299 --> 00:01:25,900 -turn into predictions by using a SoftMax. - -17 -00:01:25,900 --> 00:01:31,430 -We also get the predicted label for each token -by taking the maximum prediction (since the - -18 -00:01:31,430 --> 00:01:35,470 -softmax function preserves the order, we could -have done it on the logits if we had no need - -19 -00:01:35,470 --> 00:01:37,759 -of the predictions). - -20 -00:01:37,759 --> 00:01:42,340 -The model config contains the label mapping -in its id2label field. - -21 -00:01:42,340 --> 00:01:45,331 -Using it, we can map every token to its corresponding -label. - -22 -00:01:45,331 --> 00:01:50,490 -The label O correspond to "no entity", which -is why we didn't see it in our results in - -23 -00:01:50,490 --> 00:01:51,579 -the first slide. - -24 -00:01:51,579 --> 00:01:57,430 -On top of the label and the probability, those -results included the start and end character - -25 -00:01:57,430 --> 00:01:58,570 -in the sentence. - -26 -00:01:58,570 --> 00:02:02,610 -We will need to use the offset mapping of -the tokenizer to get those (look at the video - -27 -00:02:02,610 --> 00:02:04,820 -linked below if you don't know about them -already). 
- -28 -00:02:04,820 --> 00:02:09,750 -Then, looping through each token that has -a label distinct from O, we can build the - -29 -00:02:09,750 --> 00:02:12,440 -list of results we got with our first pipeline. - -30 -00:02:12,440 --> 00:02:18,920 -The last step is to group together tokens -that correspond to the same entity. - -31 -00:02:18,920 --> 00:02:23,290 -This is why we had two labels for each type -of entity: I-PER and B-PER for instance. - -32 -00:02:23,290 --> 00:02:29,190 -It allows us to know if a token is in the -same entity as the previous one.() Note that - -33 -00:02:29,190 --> 00:02:34,750 -there are two ways of labelling used for token -classification, one (in pink here) uses the - -34 -00:02:34,750 --> 00:02:40,380 -B-PER label at the beginning of each new entity, -but the other (in blue) only uses it to separate - -35 -00:02:40,380 --> 00:02:43,380 -two adjacent entities of the same type. - -36 -00:02:43,380 --> 00:02:48,880 -In both cases, we can flag a new entity each -time we see a new label appearing (either - -37 -00:02:48,880 --> 00:02:54,051 -with the I or B prefix) then take all the -following tokens labelled the same, with an - -38 -00:02:54,051 --> 00:02:55,051 -I-flag. - -39 -00:02:55,051 --> 00:02:59,360 -This, coupled with the offset mapping to get -the start and end characters allows us to - -40 -00:02:59,360 --> 00:03:06,500 -get the span of texts for each entity. +1 +00:00:00,180 --> 00:00:03,013 +(whooshing sound) + +2 +00:00:05,310 --> 00:00:06,143 +- Let's have a look + +3 +00:00:06,143 --> 00:00:08,133 +inside the token classification pipeline. + +4 +00:00:09,780 --> 00:00:11,430 +In the pipeline video, + +5 +00:00:11,430 --> 00:00:13,230 +we looked at the different applications + +6 +00:00:13,230 --> 00:00:16,050 +the Transformers library +supports out of the box. + +7 +00:00:16,050 --> 00:00:18,660 +One of them being token classification. + +8 +00:00:18,660 --> 00:00:22,050 +For instance, predicting +for each word in a sentence, + +9 +00:00:22,050 --> 00:00:23,790 +whether they correspond to a person, + +10 +00:00:23,790 --> 00:00:26,043 +an organization, or location. + +11 +00:00:27,690 --> 00:00:29,250 +We can even group together the tokens + +12 +00:00:29,250 --> 00:00:31,320 +corresponding to the same entity. + +13 +00:00:31,320 --> 00:00:34,890 +For instance, all the tokens +that form the word Sylvain here + +14 +00:00:34,890 --> 00:00:36,423 +or Hugging and Face. + +15 +00:00:37,320 --> 00:00:39,720 +So, token classification pipeline + +16 +00:00:39,720 --> 00:00:42,480 +works the same way as a +text classification pipeline + +17 +00:00:42,480 --> 00:00:44,910 +we studied in a previous video. + +18 +00:00:44,910 --> 00:00:46,500 +There are three steps. + +19 +00:00:46,500 --> 00:00:50,043 +Tokenization, the model, +and the post processing. + +20 +00:00:51,690 --> 00:00:53,190 +The first two steps are identical + +21 +00:00:53,190 --> 00:00:55,230 +to the text classification pipeline, + +22 +00:00:55,230 --> 00:00:58,230 +except we use an auto +token classification model + +23 +00:00:58,230 --> 00:01:00,303 +instead of a sequence classification one. + +24 +00:01:01,560 --> 00:01:04,593 +We tokenize our text, +then feed it to the model. 
+ +25 +00:01:05,580 --> 00:01:08,160 +Instead of getting one number +for each possible level + +26 +00:01:08,160 --> 00:01:09,600 +for the whole sentence, + +27 +00:01:09,600 --> 00:01:12,270 +we get one number for each +of the possible nine levels + +28 +00:01:12,270 --> 00:01:14,250 +for every token in the sentence. + +29 +00:01:14,250 --> 00:01:15,573 +Here, 19. + +30 +00:01:17,070 --> 00:01:19,710 +Like all the other models +of the Transformers library, + +31 +00:01:19,710 --> 00:01:22,560 +our model outputs logits +which we need to turn + +32 +00:01:22,560 --> 00:01:24,663 +into predictions by using a SoftMax. + +33 +00:01:25,830 --> 00:01:28,170 +We also get the predicted +label for each token + +34 +00:01:28,170 --> 00:01:30,063 +by taking the maximum prediction. + +35 +00:01:31,080 --> 00:01:33,540 +Since the softmax function +preserves the order, + +36 +00:01:33,540 --> 00:01:34,980 +we could have done it on the logits + +37 +00:01:34,980 --> 00:01:36,830 +if we had no need of the predictions. + +38 +00:01:37,680 --> 00:01:40,050 +The model config contains +the label mapping + +39 +00:01:40,050 --> 00:01:42,090 +in its id2label field. + +40 +00:01:42,090 --> 00:01:45,600 +Using it, we can map every token +to its corresponding label. + +41 +00:01:45,600 --> 00:01:48,630 +The label O corresponds to "no entity" + +42 +00:01:48,630 --> 00:01:50,460 +which is why we didn't +see it in our results + +43 +00:01:50,460 --> 00:01:52,110 +in the first slide. + +44 +00:01:52,110 --> 00:01:54,150 +On top of the label and the probability, + +45 +00:01:54,150 --> 00:01:55,620 +those results included the start + +46 +00:01:55,620 --> 00:01:57,423 +and end character in the sentence. + +47 +00:01:58,294 --> 00:01:59,880 +We'll need to use the offset mapping + +48 +00:01:59,880 --> 00:02:01,110 +of the tokenizer to get those. + +49 +00:02:01,110 --> 00:02:03,090 +Look at the video link below + +50 +00:02:03,090 --> 00:02:05,340 +if you don't know about them already. + +51 +00:02:05,340 --> 00:02:06,990 +Then, looping through each token + +52 +00:02:06,990 --> 00:02:09,090 +that has a label distinct from O, + +53 +00:02:09,090 --> 00:02:10,590 +we can build the list of results + +54 +00:02:10,590 --> 00:02:12,140 +we got with our first pipeline. + +55 +00:02:13,650 --> 00:02:15,840 +The last step is to group together tokens + +56 +00:02:15,840 --> 00:02:17,640 +that corresponds to the same entity. + +57 +00:02:18,930 --> 00:02:21,540 +This is why we had two labels +for each type of entity, + +58 +00:02:21,540 --> 00:02:23,940 +I-PER and B-PER for instance. + +59 +00:02:23,940 --> 00:02:25,530 +It allows us to know if a token + +60 +00:02:25,530 --> 00:02:27,603 +is in the same entity as a previous one. + +61 +00:02:28,620 --> 00:02:29,850 +Note that there are two ways + +62 +00:02:29,850 --> 00:02:32,490 +of labeling used for token classification. + +63 +00:02:32,490 --> 00:02:35,360 +One, in pink here, uses the B-PER label + +64 +00:02:35,360 --> 00:02:37,530 +at the beginning of each new entity. + +65 +00:02:37,530 --> 00:02:39,990 +But the other in blue only uses it + +66 +00:02:39,990 --> 00:02:42,933 +to separate two adjacent +entities of the same types. + +67 +00:02:44,340 --> 00:02:46,560 +In both cases we can flag a new entity + +68 +00:02:46,560 --> 00:02:49,110 +each time we see a new label appearing, + +69 +00:02:49,110 --> 00:02:51,330 +either with the I or B prefix. 
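The same logic carries over to TensorFlow through the TF auto class. The checkpoint is again only an example, and it may need from_pt=True if the repository only ships PyTorch weights.

```python
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

checkpoint = "dbmdz/bert-large-cased-finetuned-conll03-english"  # example NER checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForTokenClassification.from_pretrained(checkpoint)

inputs = tokenizer("My name is Sylvain and I work at Hugging Face.", return_tensors="tf")
logits = model(**inputs).logits[0]            # one row of logits per token

probabilities = tf.nn.softmax(logits, axis=-1)
predictions = tf.argmax(probabilities, axis=-1)

# Map each token to its label; "O" marks tokens outside any entity.
print([model.config.id2label[int(p)] for p in predictions])
```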
+ +70 +00:02:51,330 --> 00:02:53,850 +Then, take all the following +tokens labeled the same + +71 +00:02:53,850 --> 00:02:55,470 +with an I-flag. + +72 +00:02:55,470 --> 00:02:57,000 +This, coupled with the offset mapping + +73 +00:02:57,000 --> 00:02:59,010 +to get the start and end characters + +74 +00:02:59,010 --> 00:03:01,560 +allows us to get the span +of texts for each entity. + +75 +00:03:02,869 --> 00:03:05,702 +(whooshing sound) + diff --git a/subtitles/en/47_inside-the-question-answering-pipeline-(pytorch).srt b/subtitles/en/47_inside-the-question-answering-pipeline-(pytorch).srt index 02844ec4c..135185eff 100644 --- a/subtitles/en/47_inside-the-question-answering-pipeline-(pytorch).srt +++ b/subtitles/en/47_inside-the-question-answering-pipeline-(pytorch).srt @@ -1,225 +1,342 @@ -1 -00:00:04,130 --> 00:00:08,390 -Let's have a look inside the question answering -pipeline. - -2 -00:00:08,390 --> 00:00:12,630 -The question answering pipeline can extracts -answers to questions from a given context - -3 -00:00:12,630 --> 00:00:18,150 -or passage of text, like this part of the -Transformers repo README. - -4 -00:00:18,150 --> 00:00:22,440 -It also works for very long contexts, even -if the answer is at the very end, like in - -5 -00:00:22,440 --> 00:00:23,440 -this example. - -6 -00:00:23,440 --> 00:00:24,680 -In this video, we will see why! - -7 -00:00:24,680 --> 00:00:31,540 -The question answering pipeline follows the -same steps as the other pipelines: the question - -8 -00:00:31,540 --> 00:00:36,380 -and context are tokenized as a sentence pair, -fed to the model then some post-processing - -9 -00:00:36,380 --> 00:00:37,649 -is applied. - -10 -00:00:37,649 --> 00:00:41,790 -The tokenization and model steps should be -familiar. - -11 -00:00:41,790 --> 00:00:47,020 -We use the auto class suitable for Question -Answering instead of sequence classification, - -12 -00:00:47,020 --> 00:00:52,039 -but one key difference with text classification -is that our model outputs two tensors named - -13 -00:00:52,039 --> 00:00:54,559 -start logits and end logits. - -14 -00:00:54,559 --> 00:00:55,559 -Why is that? - -15 -00:00:55,559 --> 00:00:59,850 -Well this is the way the model finds the answer -to the question. - -16 -00:00:59,850 --> 00:01:02,270 -First let's have a look at the model inputs. - -17 -00:01:02,270 --> 00:01:07,160 -It's numbers associated with the tokenization -of the question followed by the context (with - -18 -00:01:07,160 --> 00:01:10,710 -the usual CLS and SEP special tokens). - -19 -00:01:10,710 --> 00:01:13,310 -The answer is a part of those tokens. - -20 -00:01:13,310 --> 00:01:17,759 -So we ask the model to predict which token -starts the answer and which ends the answer. - -21 -00:01:17,759 --> 00:01:24,380 -For our two logit outputs, the theoretical -labels are the pink and purple vectors. - -22 -00:01:24,380 --> 00:01:28,360 -To convert those logits into probabilities, -we will need to apply a SoftMax, like in the - -23 -00:01:28,360 --> 00:01:30,439 -text classification pipeline. - -24 -00:01:30,439 --> 00:01:35,070 -We just mask the tokens that are not part -of the context before doing that, leaving - -25 -00:01:35,070 --> 00:01:41,009 -the initial CLS token unmasked as we use it -to predict an impossible answer. - -26 -00:01:41,009 --> 00:01:43,579 -This is what it looks in terms of code. - -27 -00:01:43,579 --> 00:01:47,729 -We use a large negative number for the masking, -since its exponential will then be 0. 
- -28 -00:01:47,729 --> 00:01:53,610 -Now the probability for each start and end -position corresponding to a possible answer, - -29 -00:01:53,610 --> 00:01:57,600 -we give a score that is the product of the -start probabilities and end probabilities - -30 -00:01:57,600 --> 00:02:00,180 -at those positions. - -31 -00:02:00,180 --> 00:02:05,430 -Of course, a start index greater than an end -index corresponds to an impossible answer. - -32 -00:02:05,430 --> 00:02:08,940 -Here is the code to find the best score for -a possible answer. - -33 -00:02:08,940 --> 00:02:13,070 -Once we have the start and end positions of -the tokens, we use the offset mappings provided - -34 -00:02:13,070 --> 00:02:18,270 -by our tokenizer to find the span of characters -in the initial context, and get our answer! - -35 -00:02:18,270 --> 00:02:23,820 -Now, when the context is long, it might get -truncated by the tokenizer. - -36 -00:02:23,820 --> 00:02:29,099 -This might result in part of the answer, or -worse, the whole answer, being truncated. - -37 -00:02:29,099 --> 00:02:33,319 -So we don't discard the truncated tokens but -build new features with them. - -38 -00:02:33,319 --> 00:02:39,320 -Each of those features contains the question, -then a chunk of text in the context. - -39 -00:02:39,320 --> 00:02:43,760 -If we take disjoint chunks of texts, we might -end up with the answer being split between - -40 -00:02:43,760 --> 00:02:45,330 -two features. - -41 -00:02:45,330 --> 00:02:49,709 -So instead, we take overlapping chunks of -texts, to make sure at least one of the chunks - -42 -00:02:49,709 --> 00:02:51,650 -will fully contain the answer to the question. - -43 -00:02:51,650 --> 00:02:56,920 -The tokenizers do all of this for us automatically -with the return overflowing tokens option. - -44 -00:02:56,920 --> 00:03:02,069 -The stride argument controls the number of -overlapping tokens. - -45 -00:03:02,069 --> 00:03:05,930 -Here is how our very long context gets truncated -in two features with some overlap. - -46 -00:03:05,930 --> 00:03:10,051 -By applying the same post-processing we saw -before for each feature, we get the answer - -47 -00:03:10,051 --> 00:03:18,349 -with a score for each of them, and we take -the answer with the best score as a final - -48 -00:03:18,349 --> 00:03:21,239 -solution. +1 +00:00:04,230 --> 00:00:07,699 +- Let's have a look inside the +question answering pipeline. + +2 +00:00:07,699 --> 00:00:10,680 +The question answering +pipeline can extracts answers + +3 +00:00:10,680 --> 00:00:14,190 +to questions from a given +context or passage of text, + +4 +00:00:14,190 --> 00:00:16,540 +like this part of the +transformers repo README. + +5 +00:00:18,060 --> 00:00:20,310 +It also works for very long contexts, + +6 +00:00:20,310 --> 00:00:23,850 +even if the answer is at the +very end, like in this example. + +7 +00:00:23,850 --> 00:00:25,400 +In this video, we will see why. + +8 +00:00:26,820 --> 00:00:29,460 +The question answering +pipeline follows the same steps + +9 +00:00:29,460 --> 00:00:31,050 +as the other pipelines: + +10 +00:00:31,050 --> 00:00:34,200 +the question and context are +tokenized as a sentence pair, + +11 +00:00:34,200 --> 00:00:37,955 +fed to the model then some +post-processing is applied. + +12 +00:00:37,955 --> 00:00:41,730 +The tokenization and model +steps should be familiar. 
+ +13 +00:00:41,730 --> 00:00:44,610 +We use the auto class suitable +for question answering + +14 +00:00:44,610 --> 00:00:47,070 +instead of sequence classification, + +15 +00:00:47,070 --> 00:00:49,392 +but one key difference +with text classification + +16 +00:00:49,392 --> 00:00:52,980 +is that our model outputs two +tensors named start logits + +17 +00:00:52,980 --> 00:00:54,570 +and end logits. + +18 +00:00:54,570 --> 00:00:55,830 +Why is that? + +19 +00:00:55,830 --> 00:00:57,930 +Well, this is the way the +model finds the answer + +20 +00:00:57,930 --> 00:00:58,803 +to the question. + +21 +00:00:59,790 --> 00:01:02,130 +First, let's have a look +at the model inputs. + +22 +00:01:02,130 --> 00:01:04,350 +Its numbers associated +with the tokenization + +23 +00:01:04,350 --> 00:01:06,843 +of the question followed by the context + +24 +00:01:06,843 --> 00:01:09,723 +with the usual CLS and SEP special tokens. + +25 +00:01:10,620 --> 00:01:13,320 +The answer is a part of those tokens. + +26 +00:01:13,320 --> 00:01:15,510 +So we ask the model to +predict which token starts + +27 +00:01:15,510 --> 00:01:17,373 +the answer and which ends the answer. + +28 +00:01:18,548 --> 00:01:19,650 +For our two logit outputs, + +29 +00:01:19,650 --> 00:01:22,803 +the theoretical labels are +the pink and purple vectors. + +30 +00:01:24,300 --> 00:01:26,430 +To convert those logits +into probabilities, + +31 +00:01:26,430 --> 00:01:28,436 +we will need to apply a SoftMax, + +32 +00:01:28,436 --> 00:01:30,360 +like in the text classification pipeline. + +33 +00:01:30,360 --> 00:01:33,390 +We just mask the tokens that +are not part of the context + +34 +00:01:33,390 --> 00:01:36,855 +before doing that, leaving +the initial CLS token unmasked + +35 +00:01:36,855 --> 00:01:39,303 +as we use it to predict +an impossible answer. + +36 +00:01:40,267 --> 00:01:43,500 +This is what it looks in terms of code. + +37 +00:01:43,500 --> 00:01:45,870 +We use a large negative +number for the masking, + +38 +00:01:45,870 --> 00:01:48,957 +since its exponential will then be zero. + +39 +00:01:48,957 --> 00:01:50,580 +Now, the probability for each start + +40 +00:01:50,580 --> 00:01:53,550 +and end position corresponding +to a possible answer, + +41 +00:01:53,550 --> 00:01:55,050 +we give a score that is the product + +42 +00:01:55,050 --> 00:01:57,630 +of the start probabilities +and end probabilities + +43 +00:01:57,630 --> 00:01:58,803 +at those positions. + +44 +00:02:00,120 --> 00:02:02,670 +Of course, a start index +greater than an end index + +45 +00:02:02,670 --> 00:02:04,503 +corresponds to an impossible answer. + +46 +00:02:05,430 --> 00:02:07,080 +Here is the code to find the best score + +47 +00:02:07,080 --> 00:02:08,820 +for a possible answer. + +48 +00:02:08,820 --> 00:02:11,430 +Once we have the start and +end positions of the tokens, + +49 +00:02:11,430 --> 00:02:14,130 +we use the offset mappings +provided by our tokenizer + +50 +00:02:14,130 --> 00:02:16,950 +to find the span of characters +in the initial context, + +51 +00:02:16,950 --> 00:02:17,900 +and get our answer. + +52 +00:02:19,470 --> 00:02:21,900 +Now, when the context is +long, it might get truncated + +53 +00:02:21,900 --> 00:02:22,750 +by the tokenizer. + +54 +00:02:23,760 --> 00:02:26,220 +This might result in part +of the answer, or worse, + +55 +00:02:26,220 --> 00:02:28,113 +the whole answer, being truncated. 
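A condensed sketch of the steps described so far, start and end logits, masking everything outside the context, scoring the valid spans and mapping back to characters, might look like this; the checkpoint name is an assumed example.

```python
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

checkpoint = "distilbert-base-cased-distilled-squad"  # example QA checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

question = "Where do I work?"
context = "My name is Sylvain and I work at Hugging Face in Brooklyn."

inputs = tokenizer(question, context, return_tensors="pt", return_offsets_mapping=True)
offsets = inputs.pop("offset_mapping")[0]

with torch.no_grad():
    outputs = model(**inputs)

# Mask tokens that are not part of the context with a large negative number
# (so their softmax probability is ~0), keeping [CLS] unmasked to allow
# the "impossible answer" prediction.
mask = torch.tensor([s != 1 for s in inputs.sequence_ids()])
mask[0] = False
start_probs = torch.softmax(outputs.start_logits[0].masked_fill(mask, -10000.0), dim=-1)
end_probs = torch.softmax(outputs.end_logits[0].masked_fill(mask, -10000.0), dim=-1)

# Score every start <= end pair and keep the best one.
scores = torch.triu(start_probs[:, None] * end_probs[None, :])
best = scores.argmax().item()
start_index, end_index = best // scores.shape[1], best % scores.shape[1]

start_char, end_char = offsets[start_index][0].item(), offsets[end_index][1].item()
print(context[start_char:end_char], scores[start_index, end_index].item())
# For long contexts, the tokenizer can instead produce overlapping chunks with
# return_overflowing_tokens=True and a stride, and the same scoring runs per chunk.
```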
+ +56 +00:02:29,100 --> 00:02:31,050 +So we don't discard the truncated tokens + +57 +00:02:31,050 --> 00:02:33,330 +but build new features with them. + +58 +00:02:33,330 --> 00:02:35,994 +Each of those features +contains the question, + +59 +00:02:35,994 --> 00:02:39,240 +then a chunk of text in the context. + +60 +00:02:39,240 --> 00:02:41,430 +If we take disjoint chunks of texts, + +61 +00:02:41,430 --> 00:02:43,530 +we might end up with +the answer being split + +62 +00:02:43,530 --> 00:02:45,330 +between two features. + +63 +00:02:45,330 --> 00:02:48,060 +So instead, we take +overlapping chunks of texts, + +64 +00:02:48,060 --> 00:02:50,640 +to make sure at least one of +the chunks will fully contain + +65 +00:02:50,640 --> 00:02:51,990 +the answer to the question. + +66 +00:02:52,830 --> 00:02:55,260 +The tokenizers do all of +this for us automatically + +67 +00:02:55,260 --> 00:02:58,170 +with the return overflowing tokens option. + +68 +00:02:58,170 --> 00:02:59,700 +The stride argument controls + +69 +00:02:59,700 --> 00:03:02,070 +the number of overlapping tokens. + +70 +00:03:02,070 --> 00:03:04,020 +Here is how our very long +context gets truncated + +71 +00:03:04,020 --> 00:03:05,850 +in two features with some overlap. + +72 +00:03:05,850 --> 00:03:07,950 +By applying the same +post-processing we saw before + +73 +00:03:07,950 --> 00:03:10,636 +for each feature, we get +the answer with a score + +74 +00:03:10,636 --> 00:03:12,453 +for each of them, + +75 +00:03:12,453 --> 00:03:14,910 +and we take the answer with the best score + +76 +00:03:14,910 --> 00:03:16,203 +as a final solution. + diff --git a/subtitles/en/48_inside-the-question-answering-pipeline-(tensorflow).srt b/subtitles/en/48_inside-the-question-answering-pipeline-(tensorflow).srt index 07858744e..90725ddf5 100644 --- a/subtitles/en/48_inside-the-question-answering-pipeline-(tensorflow).srt +++ b/subtitles/en/48_inside-the-question-answering-pipeline-(tensorflow).srt @@ -1,177 +1,358 @@ -1 -00:00:05,360 --> 00:00:07,920 -Let's have a look inside the  -question answering pipeline.   - -2 -00:00:09,600 --> 00:00:13,520 -The question answering pipeline  -can extracts answers to questions   - -3 -00:00:13,520 --> 00:00:17,840 -from a given context or passage of text, like  -this part of the Transformers repo README.   - -4 -00:00:19,040 --> 00:00:23,680 -It also works for very long contexts, even if the  -answer is at the very end, like in this example.   - -5 -00:00:24,480 --> 00:00:25,840 -In this video, we will see why!   - -6 -00:00:27,600 --> 00:00:32,720 -The question answering pipeline follows the same  -steps as the other pipelines: the question and   - -7 -00:00:32,720 --> 00:00:38,160 -context are tokenized as a sentence pair, fed to  -the model then some post-processing is applied.   - -8 -00:00:39,280 --> 00:00:44,160 -The tokenization and model steps should be  -familiar. We use the auto class suitable for   - -9 -00:00:44,160 --> 00:00:48,240 -Question Answering instead of sequence  -classification, but one key difference   - -10 -00:00:48,240 --> 00:00:53,680 -with text classification is that our model  -outputs two tensors named start logits and   - -11 -00:00:53,680 --> 00:00:58,640 -end logits. Why is that? Well this is the way  -the model finds the answer to the question.   - -12 -00:00:59,920 --> 00:01:04,880 -First let's have a look at the model inputs. 
It's  -numbers associated with the tokenization of the   - -13 -00:01:04,880 --> 00:01:12,160 -question followed by the context (with the usual  -CLS and SEP special tokens). The answer is a part   - -14 -00:01:12,160 --> 00:01:17,920 -of those tokens. So we ask the model to predict  -which token starts the answer and which ends the   - -15 -00:01:17,920 --> 00:01:25,040 -answer. For our two logit outputs, the theoretical  -labels are the pink and purple vectors. To convert   - -16 -00:01:25,040 --> 00:01:29,520 -those logits into probabilities, we will need to  -apply a SoftMax, like in the text classification   - -17 -00:01:29,520 --> 00:01:36,240 -pipeline. We just mask the tokens that are not  -part of the context before doing that, leaving the   - -18 -00:01:36,240 --> 00:01:43,200 -initial CLS token unmasked as we use it to predict  -an impossible answer. This is what it looks in   - -19 -00:01:43,200 --> 00:01:49,200 -terms of code. We use a large negative number for  -the masking, since its exponential will then be 0.   - -20 -00:01:50,480 --> 00:01:54,640 -Now the probability for each start and end  -position corresponding to a possible answer,   - -21 -00:01:55,520 --> 00:02:00,000 -we give a score that is the product of the start  -probabilities and end probabilities at those   - -22 -00:02:00,000 --> 00:02:06,000 -positions. Of course, a start index greater than  -an end index corresponds to an impossible answer.   - -23 -00:02:07,360 --> 00:02:12,080 -Here is the code to find the best score for  -a possible answer. Once we have the start and   - -24 -00:02:12,080 --> 00:02:17,040 -end positions of the tokens, we use the offset  -mappings provided by our tokenizer to find the   - -25 -00:02:17,040 --> 00:02:23,520 -span of characters in the initial context, and  -get our answer! Now, when the context is long,   - -26 -00:02:23,520 --> 00:02:29,440 -it might get truncated by the tokenizer. This  -might result in part of the answer, or worse, the   - -27 -00:02:29,440 --> 00:02:34,800 -whole answer, being truncated. So we don't discard  -the truncated tokens but build new features   - -28 -00:02:34,800 --> 00:02:42,080 -with them. Each of those features contains the  -question, then a chunk of text in the context. If   - -29 -00:02:42,080 --> 00:02:47,280 -we take disjoint chunks of texts, we might end up  -with the answer being split between two features.   - -30 -00:02:48,560 --> 00:02:51,840 -So instead, we take overlapping chunks of texts,   - -31 -00:02:51,840 --> 00:02:55,520 -to make sure at least one of the chunks will  -fully contain the answer to the question.   - -32 -00:02:56,720 --> 00:03:00,880 -The tokenizers do all of this for us automatically  -with the return overflowing tokens option.   - -33 -00:03:01,680 --> 00:03:04,320 -The stride argument controls the  -number of overlapping tokens.   - -34 -00:03:05,680 --> 00:03:10,160 -Here is how our very long context gets  -truncated in two features with some overlap.   - -35 -00:03:10,960 --> 00:03:15,520 -By applying the same post-processing we saw  -before for each feature, we get the answer   - -36 -00:03:15,520 --> 00:03:27,600 -with a score for each of them, and we take the  -answer with the best score as a final solution. +1 +00:00:00,000 --> 00:00:03,417 +(light transition music) + +2 +00:00:05,490 --> 00:00:08,440 +- Let's have a look inside the +question answering pipeline. 
+ +3 +00:00:09,780 --> 00:00:11,370 +The question answering pipeline + +4 +00:00:11,370 --> 00:00:13,710 +can extract answers to questions + +5 +00:00:13,710 --> 00:00:16,020 +from a given context or passage of text + +6 +00:00:16,020 --> 00:00:18,370 +like this part of the +Transformers repo README. + +7 +00:00:19,290 --> 00:00:21,180 +It also works for very long context, + +8 +00:00:21,180 --> 00:00:24,720 +even if the answer is at the +very end, like in this example. + +9 +00:00:24,720 --> 00:00:26,223 +In this video, we'll see why. + +10 +00:00:27,840 --> 00:00:29,310 +The question answering pipeline + +11 +00:00:29,310 --> 00:00:32,130 +follows the same steps +as the other pipelines. + +12 +00:00:32,130 --> 00:00:35,550 +The question and context are +tokenized as a sentence pair, + +13 +00:00:35,550 --> 00:00:38,463 +fed to the model then some +post-processing is applied. + +14 +00:00:39,540 --> 00:00:42,840 +So tokenization and model +steps should be familiar. + +15 +00:00:42,840 --> 00:00:45,000 +We use the auto class suitable +for question answering + +16 +00:00:45,000 --> 00:00:47,460 +instead of sequence classification, + +17 +00:00:47,460 --> 00:00:50,190 +but one key difference +with text classification + +18 +00:00:50,190 --> 00:00:52,380 +is that our model outputs two tensors + +19 +00:00:52,380 --> 00:00:55,230 +named start logits and end logits. + +20 +00:00:55,230 --> 00:00:56,160 +Why is that? + +21 +00:00:56,160 --> 00:00:58,170 +Well, this is the way the +model finds the answer + +22 +00:00:58,170 --> 00:00:59,043 +to the question. + +23 +00:01:00,090 --> 00:01:02,610 +First, let's have a look +at the model inputs. + +24 +00:01:02,610 --> 00:01:04,800 +It's numbers associated +with the tokenization + +25 +00:01:04,800 --> 00:01:05,850 +of the question, + +26 +00:01:05,850 --> 00:01:07,753 +followed by the context + +27 +00:01:07,753 --> 00:01:10,233 +with the usual CLS and SEP special tokens. + +28 +00:01:11,130 --> 00:01:13,203 +The answer is a part of those tokens. + +29 +00:01:14,040 --> 00:01:15,330 +So we ask the model to predict + +30 +00:01:15,330 --> 00:01:17,040 +which token starts the answer + +31 +00:01:17,040 --> 00:01:19,320 +and which ends the answer. + +32 +00:01:19,320 --> 00:01:20,910 +For our two logit outputs, + +33 +00:01:20,910 --> 00:01:23,823 +the theoretical labels are +the pink and purple vectors. + +34 +00:01:24,870 --> 00:01:26,700 +To convert those logits +into probabilities, + +35 +00:01:26,700 --> 00:01:28,596 +we will need to apply a SoftMax, + +36 +00:01:28,596 --> 00:01:31,020 +like in the text classification pipeline. + +37 +00:01:31,020 --> 00:01:32,310 +We just mask the tokens + +38 +00:01:32,310 --> 00:01:35,940 +that are not part of the +context before doing that, + +39 +00:01:35,940 --> 00:01:38,310 +leaving the initial CLS token unmasked + +40 +00:01:38,310 --> 00:01:40,773 +as we use it to predict +an impossible answer. + +41 +00:01:41,940 --> 00:01:44,730 +This is what it looks +like in terms of code. + +42 +00:01:44,730 --> 00:01:47,340 +We use a large negative +number for the masking + +43 +00:01:47,340 --> 00:01:49,533 +since its exponential will then be zero. 
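A hedged numerical illustration of the masking step just described (not the pipeline's exact code); the sequence length and the masked question span are assumptions.

import numpy as np

# One row of start/end logits, as returned by a question answering model for
# a single example (random values stand in for real model outputs here).
seq_len = 20
start_logits = np.random.randn(seq_len)
end_logits = np.random.randn(seq_len)

# Mask the tokens that are not part of the context (assumed to be positions
# 1 to 7, the question), but keep the CLS token at index 0 so the model can
# still predict an "impossible answer".
mask = np.zeros(seq_len, dtype=bool)
mask[1:8] = True

def masked_softmax(logits, mask):
    # A very large negative logit has an exponential of about zero, so the
    # masked positions end up with a probability of about zero.
    logits = np.where(mask, -10000.0, logits)
    exp = np.exp(logits - logits.max())
    return exp / exp.sum()

start_probs = masked_softmax(start_logits, mask)
end_probs = masked_softmax(end_logits, mask)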
+ +44 +00:01:50,850 --> 00:01:53,160 +Now the probability for +each start and end position + +45 +00:01:53,160 --> 00:01:55,740 +corresponding to a possible answer + +46 +00:01:55,740 --> 00:01:57,540 +will give a score that is a product + +47 +00:01:57,540 --> 00:01:58,680 +of the start probabilities + +48 +00:01:58,680 --> 00:02:00,873 +and end probabilities at those position. + +49 +00:02:01,920 --> 00:02:04,530 +Of course, a start index +greater than an end index + +50 +00:02:04,530 --> 00:02:06,330 +corresponds to an impossible answer. + +51 +00:02:07,744 --> 00:02:09,510 +Here is the code to find the best score + +52 +00:02:09,510 --> 00:02:11,280 +for a possible answer. + +53 +00:02:11,280 --> 00:02:13,830 +Once we have the start and +end position for the tokens, + +54 +00:02:13,830 --> 00:02:16,650 +we use the offset mappings +provided by our tokenizer + +55 +00:02:16,650 --> 00:02:19,710 +to find the span of characters +in the initial context, + +56 +00:02:19,710 --> 00:02:20,810 +and we get our answer. + +57 +00:02:22,080 --> 00:02:23,700 +Now, when the context is long, + +58 +00:02:23,700 --> 00:02:25,977 +it might get truncated by the tokenizer. + +59 +00:02:26,834 --> 00:02:29,790 +This might result in part +of the answer, or worse, + +60 +00:02:29,790 --> 00:02:32,190 +the whole answer, being truncated. + +61 +00:02:32,190 --> 00:02:34,020 +So we don't discard the truncated tokens + +62 +00:02:34,020 --> 00:02:36,420 +but build new features with them. + +63 +00:02:36,420 --> 00:02:39,330 +Each of those features +contains the question, + +64 +00:02:39,330 --> 00:02:42,150 +then a chunk of text in the context. + +65 +00:02:42,150 --> 00:02:44,520 +If we take disjoint chunks of texts, + +66 +00:02:44,520 --> 00:02:45,840 +we might end up with the answer + +67 +00:02:45,840 --> 00:02:47,733 +being split between two features. + +68 +00:02:48,720 --> 00:02:52,050 +So instead, we take +overlapping chunks of text + +69 +00:02:52,050 --> 00:02:53,910 +to make sure at least one of the chunks + +70 +00:02:53,910 --> 00:02:56,940 +will fully contain the +answer to the question. + +71 +00:02:56,940 --> 00:02:59,220 +So, tokenizers does all of +this for us automatically + +72 +00:02:59,220 --> 00:03:01,920 +with the return overflowing tokens option. + +73 +00:03:01,920 --> 00:03:02,753 +The stride argument + +74 +00:03:02,753 --> 00:03:04,830 +controls the number of overlapping tokens. + +75 +00:03:05,940 --> 00:03:07,740 +Here is how our very long context + +76 +00:03:07,740 --> 00:03:10,323 +gets truncated in two +features with some overlap. + +77 +00:03:11,160 --> 00:03:12,720 +By applying the same post-processing + +78 +00:03:12,720 --> 00:03:14,850 +we saw before for each feature, + +79 +00:03:14,850 --> 00:03:17,970 +we get the answer with a +score for each of them, + +80 +00:03:17,970 --> 00:03:19,920 +and we take the answer with the best score + +81 +00:03:19,920 --> 00:03:21,303 +as a final solution. + +82 +00:03:23,089 --> 00:03:26,506 +(light transition music) + diff --git a/subtitles/en/49_what-is-normalization.srt b/subtitles/en/49_what-is-normalization.srt index 090be7355..8087de3d1 100644 --- a/subtitles/en/49_what-is-normalization.srt +++ b/subtitles/en/49_what-is-normalization.srt @@ -1,250 +1,414 @@ -1 -00:00:05,130 --> 00:00:11,060 -In this video we will see together what is -the normalizer component that we find at the - -2 -00:00:11,060 --> 00:00:12,240 -beginning of each tokenizer. 
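A hedged sketch of the long-context handling described above for the question answering pipeline; the checkpoint, lengths and context are assumptions, not the pipeline's exact internals.

from transformers import AutoTokenizer

# Assumed checkpoint; any fast tokenizer used for extractive QA behaves similarly.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")

question = "Which libraries back the Transformers library?"
long_context = " ".join(
    ["Transformers is backed by the three most popular deep learning libraries."] * 200
)

inputs = tokenizer(
    question,
    long_context,
    max_length=384,               # each feature holds at most 384 tokens
    truncation="only_second",     # only the context gets truncated
    stride=128,                   # overlap of 128 tokens between consecutive chunks
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)

# One entry per feature: the question followed by an overlapping chunk of context.
print(len(inputs["input_ids"]))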
- -3 -00:00:12,240 --> 00:00:20,610 -The normalization operation consists in applying -a succession of normalization rules to the - -4 -00:00:20,610 --> 00:00:21,960 -raw text. - -5 -00:00:21,960 --> 00:00:27,510 -We choose normalization rules to remove noise -in the text which seems useless for the learning - -6 -00:00:27,510 --> 00:00:31,420 -and use of our language model. - -7 -00:00:31,420 --> 00:00:40,790 -Let's take a very diverse sentence with different -fonts, upper and lower case characters, accents, - -8 -00:00:40,790 --> 00:00:48,490 -punctuation and multiple spaces, to see how -several tokenizers normalize it. - -9 -00:00:48,490 --> 00:00:55,039 -The tokenizer from the FNet model has transformed -the letters with font variants or circled - -10 -00:00:55,039 --> 00:01:00,230 -into their basic version and has removed the -multiple spaces. - -11 -00:01:00,230 --> 00:01:07,090 -And now if we look at the normalization with -Retribert's tokenizer, we can see that it - -12 -00:01:07,090 --> 00:01:12,990 -keeps characters with several font variants -and keeps the multiple spaces but it removes - -13 -00:01:12,990 --> 00:01:15,659 -all the accents. - -14 -00:01:15,659 --> 00:01:23,050 -And if we continue to test the normalization -of many other tokenizers associated to models - -15 -00:01:23,050 --> 00:01:34,079 -that you can find on the Hub we can see that -they also propose other normalizations. - -16 -00:01:34,079 --> 00:01:39,310 -With the fast tokenizers, it is very easy -to observe the normalization chosen for the - -17 -00:01:39,310 --> 00:01:42,500 -currently loaded tokenizer. - -18 -00:01:42,500 --> 00:01:49,250 -Indeed, each instance of a fast tokenizer -has an underlying tokenizer from the Tokenizers - -19 -00:01:49,250 --> 00:01:54,820 -library stored in the backend_tokenizer attribute. - -20 -00:01:54,820 --> 00:02:01,070 -This object has itself a normalizer attribute -that we can use thanks to the "normalize_str" - -21 -00:02:01,070 --> 00:02:04,670 -method to normalize a string. - -22 -00:02:04,670 --> 00:02:11,000 -It is thus very practical that this normalization -which was used at the time of the training - -23 -00:02:11,000 --> 00:02:17,870 -of the tokenizer was saved and that it applies -automatically when you asks a trained tokenizer - -24 -00:02:17,870 --> 00:02:21,120 -to tokenize a text. - -25 -00:02:21,120 --> 00:02:28,130 -For example, if we hadn't included the albert -normalizer we would have had a lot of unknown - -26 -00:02:28,130 --> 00:02:35,870 -tokens by tokenizing this sentence with accents -and capital letters. - -27 -00:02:35,870 --> 00:02:40,319 -These transformations can also be undetectable -with a simple "print". - -28 -00:02:40,319 --> 00:02:46,069 -Indeed, keep in mind that for a computer, -text is only a succession of 0 and 1 and it - -29 -00:02:46,069 --> 00:02:51,230 -happens that different successions of 0 and -1 render the same printed character. - -30 -00:02:51,230 --> 00:02:57,459 -The 0s and 1s go in groups of 8 to form a -byte. - -31 -00:02:57,459 --> 00:03:04,490 -The computer must then decode this sequence -of bytes into a sequence of "code points". - -32 -00:03:04,490 --> 00:03:10,959 -In our example the 2 bytes are transformed -into a single "code point" by UTF-8. - -33 -00:03:10,959 --> 00:03:18,860 -The unicode standard then allows us to find -the character corresponding to this code point: - -34 -00:03:18,860 --> 00:03:22,140 -the c cedilla. 
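A hedged sketch of the normalize_str method and of the two byte sequences discussed here; the albert-base-v2 checkpoint is only an assumed example of a fast tokenizer.

import unicodedata
from transformers import AutoTokenizer

# Any fast tokenizer exposes the normalizer saved with it at training time.
tokenizer = AutoTokenizer.from_pretrained("albert-base-v2")
normalizer = tokenizer.backend_tokenizer.normalizer
print(normalizer.normalize_str("Héllò hôw are ü?"))

# The same printed character can hide different code point sequences:
composed = "\u00e7"      # one code point: LATIN SMALL LETTER C WITH CEDILLA
decomposed = "c\u0327"   # LATIN SMALL LETTER C followed by COMBINING CEDILLA
print(composed == decomposed)                                 # False
print(unicodedata.normalize("NFC", decomposed) == composed)   # True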
- -35 -00:03:22,140 --> 00:03:28,060 -Let's repeat the same operation with this -new sequence composed of 3 bytes, this time - -36 -00:03:28,060 --> 00:03:34,450 -it is transformed into 2 "code points" .... which -also correspond to the c cedilla character! - -37 -00:03:34,450 --> 00:03:41,510 -It is in fact the composition of the unicode -Latin Small Letter Cand the combining cedilla. - -38 -00:03:41,510 --> 00:03:47,819 -But it's annoying because what appears to -us to be a single character is not at all - -39 -00:03:47,819 --> 00:03:52,379 -the same thing for the computer. - -40 -00:03:52,379 --> 00:04:02,269 -Fortunately, there are unicode standardization -standards known as NFC, NFD, NFKC and NFKD - -41 -00:04:02,269 --> 00:04:05,430 -that allow erasing some of these differences. - -42 -00:04:05,430 --> 00:04:10,019 -These standards are often used by tokenizers! - -43 -00:04:10,019 --> 00:04:15,239 -On all these previous examples, even if the -normalizations changed the look of the text, - -44 -00:04:15,239 --> 00:04:21,229 -they did not change the content: you could -still read "Hello world, let's normalize this - -45 -00:04:21,229 --> 00:04:22,540 -sentence". - -46 -00:04:22,540 --> 00:04:30,120 -However, you must be aware that some normalizations -can be very harmful if they are not adapted - -47 -00:04:30,120 --> 00:04:31,720 -to their corpus. - -48 -00:04:31,720 --> 00:04:37,360 -For example, if you take the French sentence -"un père indigné", which means "An indignant - -49 -00:04:37,360 --> 00:04:45,660 -father", and normalize it with the bert-base-uncase -tokenizer which removes the accents then the - -50 -00:04:45,660 --> 00:04:53,550 -sentence becomes "un père indigne" which -means "An unworthy father". - -51 -00:04:53,550 --> 00:04:58,699 -If you watch this video to build your own -tokenizer, there are no absolute rules to - -52 -00:04:58,699 --> 00:05:04,580 -choose or not a normalization for your brand -new tokenizer but I advise you to take the - -53 -00:05:04,580 --> 00:05:15,960 -time to select them so that they do not make -you lose important information. +1 +00:00:00,286 --> 00:00:02,869 +(subtle blast) + +2 +00:00:04,694 --> 00:00:07,380 +- In this video, we will see together + +3 +00:00:07,380 --> 00:00:09,930 +what is the normalizer component + +4 +00:00:09,930 --> 00:00:13,023 +that we'd find at the +beginning of each tokenizer. + +5 +00:00:14,550 --> 00:00:16,830 +The normalization operation consists + +6 +00:00:16,830 --> 00:00:19,890 +in applying a succession +of normalization rules + +7 +00:00:19,890 --> 00:00:20,853 +to the raw text. + +8 +00:00:21,870 --> 00:00:25,710 +We choose normalization rules +to remove noise in the text + +9 +00:00:25,710 --> 00:00:27,900 +which seem useless for the learning + +10 +00:00:27,900 --> 00:00:30,363 +and use of our language model. + +11 +00:00:33,090 --> 00:00:37,470 +Let's take a very diverse +sentence with different fonts, + +12 +00:00:37,470 --> 00:00:39,780 +upper and lower case characters, + +13 +00:00:39,780 --> 00:00:43,083 +accents, punctuation and multiple spaces, + +14 +00:00:43,920 --> 00:00:46,683 +to see how several tokenizer normalize it. + +15 +00:00:48,488 --> 00:00:50,730 +The tokenizer from the FNet model + +16 +00:00:50,730 --> 00:00:53,700 +has transformed the +letter with font variants + +17 +00:00:53,700 --> 00:00:57,480 +or circled into their basic version + +18 +00:00:57,480 --> 00:00:59,733 +and has removed the multiple spaces. 
+ +19 +00:01:00,960 --> 00:01:03,960 +And now if we look at the normalization + +20 +00:01:03,960 --> 00:01:05,880 +with Retribert's tokenizer, + +21 +00:01:05,880 --> 00:01:08,010 +we can see that it keeps characters + +22 +00:01:08,010 --> 00:01:12,090 +with several font variants +and keeps the multiple spaces, + +23 +00:01:12,090 --> 00:01:14,223 +but it removes all the accents. + +24 +00:01:16,170 --> 00:01:18,870 +And if we continue to +test this normalization + +25 +00:01:18,870 --> 00:01:23,040 +of many other tokenizers +associated to models + +26 +00:01:23,040 --> 00:01:25,110 +that we can find on the Hub, + +27 +00:01:25,110 --> 00:01:28,833 +we see that they also propose +other kind of normalization. + +28 +00:01:33,900 --> 00:01:35,850 +With the fast tokenizers, + +29 +00:01:35,850 --> 00:01:39,060 +it's very easy to observe +the normalization chosen + +30 +00:01:39,060 --> 00:01:41,193 +for the currently loaded tokenizer. + +31 +00:01:42,330 --> 00:01:46,140 +Indeed, each instance of a fast tokenizer + +32 +00:01:46,140 --> 00:01:48,030 +has an underlying tokenizer + +33 +00:01:48,030 --> 00:01:51,390 +from the HuggingFace +Tokenizers library stored + +34 +00:01:51,390 --> 00:01:53,643 +in the backend_tokenizer attribute. + +35 +00:01:54,690 --> 00:01:58,470 +This object has itself +a normalizer attribute + +36 +00:01:58,470 --> 00:02:01,830 +that we can use thanks to +the normalize_str method + +37 +00:02:01,830 --> 00:02:03,153 +to normalize a string. + +38 +00:02:04,560 --> 00:02:08,700 +It is thus very practical +that this normalization, + +39 +00:02:08,700 --> 00:02:11,070 +which was used at the time of the training + +40 +00:02:11,070 --> 00:02:12,903 +of the tokenizer was saved, + +41 +00:02:13,857 --> 00:02:16,200 +and that it applies automatically + +42 +00:02:16,200 --> 00:02:19,233 +when you ask a trained +tokenizer to tokenize a text. + +43 +00:02:21,000 --> 00:02:25,500 +For example, if we hadn't +included the albert normalizer, + +44 +00:02:25,500 --> 00:02:28,770 +we would have had a lot of unknown tokens + +45 +00:02:28,770 --> 00:02:30,930 +by tokenizing this sentence + +46 +00:02:30,930 --> 00:02:33,213 +with accents and capital letters. + +47 +00:02:35,730 --> 00:02:38,370 +This transformation can +also be undetectable + +48 +00:02:38,370 --> 00:02:40,050 +with a simple print. + +49 +00:02:40,050 --> 00:02:42,810 +Indeed, keep in mind that for a computer, + +50 +00:02:42,810 --> 00:02:45,840 +text is only a succession of 0 and 1, + +51 +00:02:45,840 --> 00:02:47,820 +and it happens that different successions + +52 +00:02:47,820 --> 00:02:51,363 +of 0 and 1 render the +same printed character. + +53 +00:02:52,380 --> 00:02:56,403 +The 0 and 1 go in group +of 8 to form a byte. + +54 +00:02:57,480 --> 00:03:00,690 +The computer must then +decode this sequence of bytes + +55 +00:03:00,690 --> 00:03:02,493 +into a sequence of code points. + +56 +00:03:04,530 --> 00:03:09,530 +In our example, the 2 bytes +is decoded using UTF-8 + +57 +00:03:09,900 --> 00:03:11,403 +into a single code point. + +58 +00:03:12,450 --> 00:03:15,090 +The unicode standard then allows us + +59 +00:03:15,090 --> 00:03:18,191 +to find the character +corresponding to this code point, + +60 +00:03:18,191 --> 00:03:20,283 +the c cedilla. + +61 +00:03:21,499 --> 00:03:23,790 +Let's repeat the same operation + +62 +00:03:23,790 --> 00:03:26,577 +with this new sequence +composed of 3 bytes,. 
+ +63 +00:03:27,420 --> 00:03:30,543 +This time it is transformed +into two code points, + +64 +00:03:31,410 --> 00:03:35,280 +which also correspond to +the c cedilla character. + +65 +00:03:35,280 --> 00:03:36,780 +It is in fact the composition + +66 +00:03:36,780 --> 00:03:39,810 +of the unicode Latin Small Letter C + +67 +00:03:39,810 --> 00:03:42,240 +and the combining cedilla. + +68 +00:03:42,240 --> 00:03:45,000 +But it's annoying because +what appears to us + +69 +00:03:45,000 --> 00:03:46,680 +to be a single character + +70 +00:03:46,680 --> 00:03:49,653 +is not at all the same +thing for the computer. + +71 +00:03:52,470 --> 00:03:57,240 +Fortunately, there are unicode +standardization standards + +72 +00:03:57,240 --> 00:04:02,130 +known as NFC, NFD, NFKC or NFKD + +73 +00:04:02,130 --> 00:04:04,893 +that allow erasing some +of these differences. + +74 +00:04:05,730 --> 00:04:08,223 +These standards are +often used by tokenizers. + +75 +00:04:09,900 --> 00:04:12,090 +On all these previous examples, + +76 +00:04:12,090 --> 00:04:15,510 +even if the normalizations +changed the look of the text, + +77 +00:04:15,510 --> 00:04:17,970 +they did not change the content; + +78 +00:04:17,970 --> 00:04:19,177 +you could still read, + +79 +00:04:19,177 --> 00:04:21,987 +"Hello world, let's +normalize this sentence." + +80 +00:04:22,980 --> 00:04:25,980 +However, you must be aware +that some normalizations + +81 +00:04:25,980 --> 00:04:30,363 +can be very harmful if they are +not adapted to their corpus. + +82 +00:04:31,620 --> 00:04:34,387 +For example, if you take +the French sentence, + +83 +00:04:34,387 --> 00:04:38,790 +"Un pere indigne," which +means "An indignant father," + +84 +00:04:38,790 --> 00:04:42,510 +and normalize it with the +bert-base-uncase tokenizer + +85 +00:04:42,510 --> 00:04:44,313 +which removes the accent, + +86 +00:04:45,150 --> 00:04:48,000 +then the sentence +becomes "Un pere indigne" + +87 +00:04:48,000 --> 00:04:49,707 +which means "An unworthy father". + +88 +00:04:53,460 --> 00:04:56,760 +If you watched this video +to build your own tokenizer, + +89 +00:04:56,760 --> 00:04:59,610 +there are no absolute +rules to choose or not + +90 +00:04:59,610 --> 00:05:02,970 +a normalization for a new tokenizer, + +91 +00:05:02,970 --> 00:05:06,210 +but I advise you to take +the time to select them + +92 +00:05:06,210 --> 00:05:10,743 +so that they do not make you +lose important information. + +93 +00:05:12,296 --> 00:05:14,879 +(subtle blast) + diff --git a/subtitles/en/50_what-is-pre-tokenization.srt b/subtitles/en/50_what-is-pre-tokenization.srt index 425b0c7a0..840595abc 100644 --- a/subtitles/en/50_what-is-pre-tokenization.srt +++ b/subtitles/en/50_what-is-pre-tokenization.srt @@ -1,115 +1,193 @@ -1 -00:00:05,549 --> 00:00:12,309 -The tokenization pipeline involves several -steps that convert raw text into numbers. - -2 -00:00:12,309 --> 00:00:15,990 -In this video, we will see what happens during -the pre-tokenization step. - -3 -00:00:15,990 --> 00:00:23,840 -The pre-tokenization operation is the operation -performed after the normalization of the text - -4 -00:00:23,840 --> 00:00:28,830 -and before the application of the tokenization -algorithm. - -5 -00:00:28,830 --> 00:00:33,489 -This step consists in applying rules that -do not need to be learned to perform a first - -6 -00:00:33,489 --> 00:00:38,270 -division of the text. - -7 -00:00:38,270 --> 00:00:46,270 -Let's look at how several tokenizers pre_tokenize -this example. 
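A hedged sketch of how these pre-tokenizations can be inspected with the pre_tokenize_str method presented later in this video; the three checkpoints are assumed examples.

from transformers import AutoTokenizer

text = "Let's pre-tokenize    this example!"

for checkpoint in ["gpt2", "albert-base-v2", "bert-base-uncased"]:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    pre_tokenizer = tokenizer.backend_tokenizer.pre_tokenizer
    # Returns a list of (word, (start, end)) pairs; for gpt2 the spaces show
    # up as the 'Ġ' symbol mentioned in the video.
    print(checkpoint, pre_tokenizer.pre_tokenize_str(text))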
- -8 -00:00:46,270 --> 00:00:53,430 -The gpt 2 pretokenization divides the text -on spaces and some punctuation - but the apostrophe - -9 -00:00:53,430 --> 00:00:57,840 -is not a division criterion for example. - -10 -00:00:57,840 --> 00:01:06,580 -We also notice that spaces have been replaced -by a capital G with a dot above. - -11 -00:01:06,580 --> 00:01:12,900 -Albert's pre-tokenization divides the text -at the level of spaces, adds a space at the - -12 -00:01:12,900 --> 00:01:19,610 -beginning of the sentence and replaces spaces -with a special underscore. - -13 -00:01:19,610 --> 00:01:29,320 -Finally, Bert's pre-tokenization divides the -text at the level of punctuation and spaces. - -14 -00:01:29,320 --> 00:01:35,460 -Unlike the previous tokenizers, spaces are -not transformed and integrated to the tokens - -15 -00:01:35,460 --> 00:01:40,079 -produced with this pre-tokenizer. - -16 -00:01:40,079 --> 00:01:45,860 -Through these 3 examples, we could observe -the two main types of operations brought by - -17 -00:01:45,860 --> 00:01:54,210 -the pre-tokenization: some changes on the -text and the division of the string into tokens - -18 -00:01:54,210 --> 00:01:57,259 -that can be associated to words. - -19 -00:01:57,259 --> 00:02:06,729 -Finally, the "backend_tokenizer" of the fast -tokenizers also allows to test the pre-tokenization - -20 -00:02:06,729 --> 00:02:12,739 -operation very easily thanks to its "pre_tokenize_str" -method. - -21 -00:02:12,739 --> 00:02:18,740 -We notice that the output of this operation -is composed of both tokens and offsets which - -22 -00:02:18,740 --> 00:02:24,830 -allow to link the token to its position in -the text given in input of the method. - -23 -00:02:24,830 --> 00:02:32,269 -This operation defines the largest tokens -that can be produced by the tokenization or - -24 -00:02:32,269 --> 00:02:48,389 -in other words the barriers of the sub-tokens -which will be produced then. +1 +00:00:05,550 --> 00:00:08,910 +- The tokenization pipeline +involves several steps + +2 +00:00:08,910 --> 00:00:11,073 +that converts raw text into numbers. + +3 +00:00:12,180 --> 00:00:14,280 +In this video, we will see what happens + +4 +00:00:14,280 --> 00:00:16,293 +during the pre-tokenization step. + +5 +00:00:18,390 --> 00:00:22,110 +The pre-tokenization operation +is the operation performed + +6 +00:00:22,110 --> 00:00:24,630 +after the normalization of the text + +7 +00:00:24,630 --> 00:00:27,633 +and before the application of +the tokenization algorithm. + +8 +00:00:29,112 --> 00:00:31,110 +This step consists in applying rules + +9 +00:00:31,110 --> 00:00:32,550 +that do not need to be learned + +10 +00:00:32,550 --> 00:00:34,563 +to perform a first division of the text. + +11 +00:00:38,160 --> 00:00:41,310 +Let's look at how several tokenizers + +12 +00:00:41,310 --> 00:00:43,143 +pre-tokenize in this example. + +13 +00:00:46,200 --> 00:00:50,820 +The gpt2 pre-tokenization +divides the text on spaces + +14 +00:00:50,820 --> 00:00:55,820 +and some punctuation, but +not on the apostrophe. + +15 +00:00:57,750 --> 00:01:01,170 +We also notice that +spaces have been replaced + +16 +00:01:01,170 --> 00:01:03,813 +by capital G with a dot above. + +17 +00:01:07,170 --> 00:01:09,540 +Albert's pre-tokenization divides the text + +18 +00:01:09,540 --> 00:01:11,043 +at the level of spaces, + +19 +00:01:11,970 --> 00:01:15,300 +adds a space at the +beginning of the sentence, + +20 +00:01:15,300 --> 00:01:18,873 +and replaces spaces with +a special underscore. 
+
+21
+00:01:20,580 --> 00:01:24,780
+Finally, Bert's pre-tokenization
+divides the text
+
+22
+00:01:24,780 --> 00:01:28,083
+at the level of punctuation and spaces.
+
+23
+00:01:28,920 --> 00:01:31,260
+But unlike the previous tokenizers,
+
+24
+00:01:31,260 --> 00:01:33,780
+spaces are not transformed
+
+25
+00:01:33,780 --> 00:01:37,293
+and integrated into tokens
+produced with this pre-tokenizer.
+
+26
+00:01:40,080 --> 00:01:42,120
+Through these three examples,
+
+27
+00:01:42,120 --> 00:01:45,330
+we could observe the two
+main types of operation
+
+28
+00:01:45,330 --> 00:01:47,073
+brought by the pre-tokenization;
+
+29
+00:01:48,420 --> 00:01:49,900
+some changes on the text
+
+30
+00:01:50,820 --> 00:01:54,180
+and the division of the string into tokens
+
+31
+00:01:54,180 --> 00:01:56,043
+that can be associated to words.
+
+32
+00:01:59,430 --> 00:02:04,230
+Finally, the backend tokenizer
+of the fast tokenizers
+
+33
+00:02:04,230 --> 00:02:07,680
+also allows us to test the
+pre-tokenization operation
+
+34
+00:02:07,680 --> 00:02:11,253
+very easily, thanks to its
+pre_tokenize_str method.
+
+35
+00:02:12,630 --> 00:02:14,970
+We notice that the
+output of this operation
+
+36
+00:02:14,970 --> 00:02:18,450
+is composed of both tokens and offsets,
+
+37
+00:02:18,450 --> 00:02:21,960
+which allow us to link each token
+to its position in the text
+
+38
+00:02:21,960 --> 00:02:23,943
+given as input to the method.
+
+39
+00:02:25,650 --> 00:02:28,860
+This operation defines the largest tokens
+
+40
+00:02:28,860 --> 00:02:31,740
+that can be produced by the tokenization,
+
+41
+00:02:31,740 --> 00:02:36,090
+or in other words, the
+boundaries of the sub-tokens
+
+42
+00:02:36,090 --> 00:02:37,653
+which will be produced then.
+
+43
+00:02:40,050 --> 00:02:41,850
+And that's all for the characteristics
+
+44
+00:02:41,850 --> 00:02:43,203
+of the pre-tokenizers.
+
diff --git a/subtitles/en/51_byte-pair-encoding-tokenization.srt b/subtitles/en/51_byte-pair-encoding-tokenization.srt
index 6dfbe13d7..79c87c176 100644
--- a/subtitles/en/51_byte-pair-encoding-tokenization.srt
+++ b/subtitles/en/51_byte-pair-encoding-tokenization.srt
@@ -1,204 +1,377 @@
-1
-00:00:05,120 --> 00:00:07,440
-You are at the right place if you want to
-
-2
-00:00:07,440 --> 00:00:15,360
-understand what the Byte pair Encoding subword
-tokenization algorithm is, how to train it
-
-3
-00:00:15,360 --> 00:00:18,640
-and how the tokenization of a
-text is done with this algorithm.
-
-4
-00:00:21,600 --> 00:00:25,920
-The BPE algorithm was initially
-proposed as a text compression algorithm
-
-5
-00:00:26,640 --> 00:00:30,800
-but it is also very well suited as a
-tokenizer for your language models.
-
-6
-00:00:32,560 --> 00:00:38,720
-The idea of BPE is to divide words into a
-sequence of "subword units" which are units
-
-7
-00:00:38,720 --> 00:00:44,400
-that appear frequently in a reference corpus
-- that is, the corpus we used to train it.
-
-8
-00:00:46,560 --> 00:00:53,680
-How is a BPE tokenizer trained? First of all,
-we have to get a corpus of texts. We will not
-
-9
-00:00:54,480 --> 00:01:02,080
-train our tokenizer on this raw text but we will
-first normalize it then pre-tokenize it. As the
-
-10
-00:01:02,080 --> 00:01:07,520
-pre-tokenization divides the text into a list
-of words, we can represent our corpus in another
-
-11
-00:01:07,520 --> 00:01:14,000
-way by gathering together the same words and by
-maintaining a counter, here represented in blue.
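A minimal sketch of the word counter mentioned above, on an assumed tiny corpus rather than the exact counts shown on screen:

from collections import Counter

# Assumed corpus, already normalized and pre-tokenized into words.
corpus = [
    ["hug", "hug", "hugging", "huggingface"],
    ["hugger", "hugging", "hug"],
]

word_freqs = Counter(word for sentence in corpus for word in sentence)
print(word_freqs)   # e.g. Counter({'hug': 3, 'hugging': 2, 'huggingface': 1, 'hugger': 1})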
- -12 -00:01:17,120 --> 00:01:22,960 -To understand how the training works, we consider  -this toy corpus composed of the following words:   - -13 -00:01:23,520 --> 00:01:32,480 -huggingface, hugging, hug, hugger, etc. BPE is an  -algorithm that starts with an initial vocabulary   - -14 -00:01:32,480 --> 00:01:35,200 -and then increases it to the desired size.   - -15 -00:01:36,240 --> 00:01:41,360 -To build the initial vocabulary, we start  -by separating each word of the corpus   - -16 -00:01:41,360 --> 00:01:46,640 -into a list of elementary units that  -compose them -here the characters.   - -17 -00:01:50,800 --> 00:01:51,360 -We could also have chosen bytes as elementary  -units but it would have been less visual. We list   - -18 -00:01:51,360 --> 00:01:57,760 -in our vocabulary all the characters that appear  -and that will constitute our initial vocabulary!   - -19 -00:02:00,240 --> 00:02:09,840 -Let's now see how to increase it. We return to  -our split corpus, we will go through the words   - -20 -00:02:09,840 --> 00:02:18,480 -one by one and count all the occurrences of token  -pairs. The first pair is composed of the token "h"   - -21 -00:02:18,480 --> 00:02:26,080 -and "u", the second 'u' and "g", and we continue  -like that until we have the complete list.   - -22 -00:02:35,440 --> 00:02:41,200 -Once we know all the pairs and their frequency  -of appearance, we will choose the one that   - -23 -00:02:41,200 --> 00:02:49,840 -appears the most frequently: here it is the  -pair composed of the letters 'l' and 'e'.   - -24 -00:02:51,680 --> 00:02:57,040 -We note our first merging rule and we  -add the new token to our vocabulary.   - -25 -00:03:00,080 --> 00:03:04,080 -We can then apply this merging rule to our splits:   - -26 -00:03:04,080 --> 00:03:09,280 -you can see that we have merged all the pairs  -of tokens composed of the tokens "l" and "e".   - -27 -00:03:13,840 --> 00:03:19,040 -And now we just have to reproduce  -the same steps with our new splits:   - -28 -00:03:21,520 --> 00:03:24,640 -we calculate the frequency of  -occurrence of each pair of tokens,   - -29 -00:03:27,760 --> 00:03:33,680 -we select the pair with the highest  -frequency, we note it in our merge rules,   - -30 -00:03:35,760 --> 00:03:38,720 -we add the new one to the vocabulary   - -31 -00:03:39,600 --> 00:03:46,160 -and then we merge all the pairs of tokens composed  -of the token "le" and "a" into our splits.   - -32 -00:03:50,160 --> 00:03:59,840 -And we can repeat this operation until  -we reach the desired vocabulary size.   - -33 -00:04:05,600 --> 00:04:13,200 -Here we stopped when our vocabulary reached 21  -tokens. We can see now that the words of our   - -34 -00:04:13,200 --> 00:04:20,560 -corpus are now divided into far fewer tokens than  -at the beginning of the training. We can see that   - -35 -00:04:20,560 --> 00:04:27,840 -our algorithm has learned the radicals "hug"  -and "learn" and also the verbal ending "ing".   - -36 -00:04:29,760 --> 00:04:35,600 -Now that we have learned our vocabulary and  -our merging rules, we can tokenize new texts.   - -37 -00:04:37,840 --> 00:04:41,120 -For example, if we want to tokenize the word   - -38 -00:04:41,120 --> 00:04:48,480 -hugs: first we'll divide it into elementary  -units so it became a sequence of characters.   - -39 -00:04:49,840 --> 00:04:53,680 -Then we'll go through our merge rules  -until we have one that we can apply.   - -40 -00:04:54,480 --> 00:05:01,040 -Here we can merge the letters h and u. 
And here  -we can merge 2 tokens to get the new token hug.   - -41 -00:05:02,240 --> 00:05:09,840 -When we get to the end of our merge  -rule the tokenization is finished.   - -42 -00:05:10,640 --> 00:05:22,400 -ßAnd that's it, I hope that now the BPE  -algorithm has no more secret for you! +1 +00:00:00,125 --> 00:00:05,125 +(air whooshing) + +2 +00:00:05,190 --> 00:00:06,720 +- You are at the right place + +3 +00:00:06,720 --> 00:00:10,464 +if you want to understand +what the Byte Pair Encoding + +4 +00:00:10,464 --> 00:00:13,263 +subword tokenization algorithm is, + +5 +00:00:14,160 --> 00:00:15,505 +how to train it + +6 +00:00:15,505 --> 00:00:17,790 +and how the tokenization of a text is done + +7 +00:00:17,790 --> 00:00:19,107 +with this algorithm. + +8 +00:00:21,417 --> 00:00:22,920 +The BPE algorithm + +9 +00:00:22,920 --> 00:00:26,820 +was initially proposed as a +text compression algorithm + +10 +00:00:26,820 --> 00:00:28,770 +but it is also very well suited + +11 +00:00:28,770 --> 00:00:31,143 +as a tokenizer for your language models. + +12 +00:00:32,910 --> 00:00:34,890 +The idea of BPE is to divide words + +13 +00:00:34,890 --> 00:00:36,933 +into a sequence of 'subword units' + +14 +00:00:38,100 --> 00:00:41,970 +which are units that appear +frequently in a reference corpus + +15 +00:00:41,970 --> 00:00:44,613 +which is, the corpus we used to train it. + +16 +00:00:46,701 --> 00:00:49,083 +How is a BPE tokenizer trained? + +17 +00:00:50,100 --> 00:00:53,340 +First of all, we have to +get a corpus of texts. + +18 +00:00:53,340 --> 00:00:56,940 +We will not train our +tokenizer on this raw text + +19 +00:00:56,940 --> 00:00:59,490 +but we will first normalize it + +20 +00:00:59,490 --> 00:01:00,873 +then pre-tokenize it. + +21 +00:01:01,890 --> 00:01:03,240 +As the pre-tokenization + +22 +00:01:03,240 --> 00:01:05,790 +divides the text into a list of words, + +23 +00:01:05,790 --> 00:01:08,400 +we can represent our corpus in another way + +24 +00:01:08,400 --> 00:01:10,350 +by gathering together the same words + +25 +00:01:10,350 --> 00:01:12,450 +and by maintaining a counter, + +26 +00:01:12,450 --> 00:01:14,223 +here represented in blue. + +27 +00:01:17,340 --> 00:01:19,860 +To understand how the training works, + +28 +00:01:19,860 --> 00:01:23,730 +we consider this toy corpus +composed of the following words: + +29 +00:01:23,730 --> 00:01:28,203 +huggingface, hugging, hug, hugger, etc. + +30 +00:01:29,100 --> 00:01:32,640 +BPE is an algorithm that starts +with an initial vocabulary + +31 +00:01:32,640 --> 00:01:35,583 +and then increases it to the desired size. + +32 +00:01:36,450 --> 00:01:38,460 +To build the initial vocabulary, + +33 +00:01:38,460 --> 00:01:41,550 +we start by separating +each word of the corpus + +34 +00:01:41,550 --> 00:01:44,253 +into a list of elementary +units that compose them, + +35 +00:01:45,210 --> 00:01:47,013 +here, the characters. + +36 +00:01:50,850 --> 00:01:54,310 +We list in our vocabulary all +the characters that appear + +37 +00:01:55,218 --> 00:01:58,053 +and that will constitute +our initial vocabulary. + +38 +00:02:00,420 --> 00:02:02,523 +Let's now see how to increase it. + +39 +00:02:05,520 --> 00:02:08,250 +We return to our split corpus, + +40 +00:02:08,250 --> 00:02:11,340 +we will go through the words one by one + +41 +00:02:11,340 --> 00:02:14,313 +and count all the +occurrences of token pairs. 
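A hedged sketch of the pair-counting step just described, with assumed word frequencies and the character-level splits used at the start of training:

from collections import defaultdict

# Assumed state: each word of the corpus with its frequency and its current split.
word_freqs = {"hug": 10, "pug": 5, "hugs": 5}
splits = {word: list(word) for word in word_freqs}

pair_freqs = defaultdict(int)
for word, freq in word_freqs.items():
    split = splits[word]
    for pair in zip(split, split[1:]):
        pair_freqs[pair] += freq

best_pair = max(pair_freqs, key=pair_freqs.get)
print(best_pair, pair_freqs[best_pair])   # the most frequent pair, ('u', 'g') here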
+ +42 +00:02:15,450 --> 00:02:18,397 +The first pair is composed +of the token 'h' and 'u', + +43 +00:02:20,130 --> 00:02:23,067 +the second 'u' and 'g', + +44 +00:02:23,067 --> 00:02:26,253 +and we continue like that until +we have the complete list. + +45 +00:02:35,580 --> 00:02:37,724 +Once we know all the pairs + +46 +00:02:37,724 --> 00:02:40,140 +and their frequency of appearance, + +47 +00:02:40,140 --> 00:02:42,940 +we will choose the one that +appears the most frequently. + +48 +00:02:44,220 --> 00:02:47,697 +Here it is the pair composed +of the letters 'l' and 'e'. + +49 +00:02:51,930 --> 00:02:53,590 +We note our first merging rule + +50 +00:02:54,593 --> 00:02:57,243 +and we add the new +token to our vocabulary. + +51 +00:03:00,330 --> 00:03:04,260 +We can then apply this +merging rule to our splits. + +52 +00:03:04,260 --> 00:03:07,350 +You can see that we have +merged all the pairs of tokens + +53 +00:03:07,350 --> 00:03:09,793 +composed of the tokens 'l' and 'e'. + +54 +00:03:14,008 --> 00:03:18,150 +And now, we just have to +reproduce the same steps + +55 +00:03:18,150 --> 00:03:19,353 +with our new splits. + +56 +00:03:21,750 --> 00:03:23,460 +We calculate the frequency of occurrence + +57 +00:03:23,460 --> 00:03:25,023 +of each pair of tokens, + +58 +00:03:27,990 --> 00:03:30,603 +we select the pair with +the highest frequency, + +59 +00:03:32,190 --> 00:03:34,083 +we note it in our merge rules, + +60 +00:03:36,000 --> 00:03:39,360 +we add the new one token the vocabulary + +61 +00:03:39,360 --> 00:03:41,880 +and then we merge all the pairs of tokens + +62 +00:03:41,880 --> 00:03:46,503 +composed of the token 'le' +and 'a' into our splits. + +63 +00:03:50,323 --> 00:03:51,960 +And we can repeat this operation + +64 +00:03:51,960 --> 00:03:54,843 +until we reach the +desired vocabulary size. + +65 +00:04:05,671 --> 00:04:10,671 +Here, we stopped when our +vocabulary reached 21 tokens. + +66 +00:04:11,040 --> 00:04:13,920 +We can see now that +the words of our corpus + +67 +00:04:13,920 --> 00:04:17,040 +are now divided into far fewer tokens + +68 +00:04:17,040 --> 00:04:20,280 +than at the beginning of the training. + +69 +00:04:20,280 --> 00:04:21,720 +And that our algorithm + +70 +00:04:21,720 --> 00:04:24,990 +has learned the radicals 'hug' and 'learn' + +71 +00:04:24,990 --> 00:04:27,537 +and also the verbal ending 'ing'. + +72 +00:04:29,880 --> 00:04:32,160 +Now that we have learned our vocabulary + +73 +00:04:32,160 --> 00:04:35,943 +and merging rules, we +can tokenize new texts. + +74 +00:04:37,980 --> 00:04:39,210 +For example, + +75 +00:04:39,210 --> 00:04:41,160 +if we want to tokenize the word 'hugs', + +76 +00:04:42,960 --> 00:04:46,680 +first we'll divide it +into elementary units + +77 +00:04:46,680 --> 00:04:48,843 +so it became a sequence of characters. + +78 +00:04:50,040 --> 00:04:52,020 +Then, we'll go through our merge rules + +79 +00:04:52,020 --> 00:04:54,690 +until we have one we can apply. + +80 +00:04:54,690 --> 00:04:57,930 +Here, we can merge the +letters 'h' and 'u'. + +81 +00:04:57,930 --> 00:05:01,467 +And here, we can merge 2 tokens +to get the new token 'hug'. + +82 +00:05:02,400 --> 00:05:05,760 +When we get to the end of our merge rules, + +83 +00:05:05,760 --> 00:05:07,563 +the tokenization is finished. + +84 +00:05:10,650 --> 00:05:11,727 +And that's it. + +85 +00:05:12,846 --> 00:05:14,850 +I hope that now the BPE algorithm + +86 +00:05:14,850 --> 00:05:16,413 +has no more secret for you! 
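A compact, hedged sketch of this whole toy training loop and of tokenizing a new word with the learned merge rules; the corpus counts and the target vocabulary size are assumptions.

from collections import defaultdict

word_freqs = {"hug": 10, "pug": 5, "hugs": 5, "huggingface": 2}   # assumed counts
splits = {word: list(word) for word in word_freqs}
vocab = sorted({char for word in word_freqs for char in word})    # initial vocabulary
merges = []                                                       # learned merge rules
target_vocab_size = 15                                            # assumed target size

def count_pairs():
    pairs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        for pair in zip(split, split[1:]):
            pairs[pair] += freq
    return pairs

def apply_merge(first, second, split):
    # Replace every occurrence of the pair (first, second) by the merged token.
    i = 0
    while i < len(split) - 1:
        if split[i] == first and split[i + 1] == second:
            split[i : i + 2] = [first + second]
        else:
            i += 1

while len(vocab) < target_vocab_size:
    pairs = count_pairs()
    if not pairs:
        break
    first, second = max(pairs, key=pairs.get)   # most frequent pair
    merges.append((first, second))
    vocab.append(first + second)
    for split in splits.values():
        apply_merge(first, second, split)

def tokenize(word):
    split = list(word)
    for first, second in merges:                # apply the merges in learned order
        apply_merge(first, second, split)
    return split

print(merges)
print(tokenize("hugs"))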
+ +87 +00:05:17,739 --> 00:05:20,406 +(air whooshing) + diff --git a/subtitles/en/52_wordpiece-tokenization.srt b/subtitles/en/52_wordpiece-tokenization.srt index acfd9947d..fb0b3a571 100644 --- a/subtitles/en/52_wordpiece-tokenization.srt +++ b/subtitles/en/52_wordpiece-tokenization.srt @@ -1,154 +1,290 @@ -1 -00:00:05,520 --> 00:00:10,000 -Let's see together what is the training  -strategy of the WordPiece algorithm   - -2 -00:00:10,560 --> 00:00:15,920 -and how it performs the  -tokenization of a text once trained   - -3 -00:00:19,200 --> 00:00:25,280 -WordPiece is a tokenization algorithm introduced  -by Google. It is used for example by Bert.   - -4 -00:00:26,480 --> 00:00:30,640 -To our knowledge, the code of Word  -Pieces has not been open sourced,   - -5 -00:00:31,360 --> 00:00:36,640 -so we base our explanations on our own  -interpretation of the published literature.   - -6 -00:00:42,480 --> 00:00:48,480 -What is the training strategy of  -WordPiece? Similarly to the BPE algorithm,   - -7 -00:00:48,480 --> 00:00:54,480 -WordPiece starts by establishing an initial  -vocabulary composed of elementary units   - -8 -00:00:54,480 --> 00:01:01,760 -and then increases this vocabulary to the  -desired size. To build the initial vocabulary,   - -9 -00:01:01,760 --> 00:01:07,120 -we divide each word in the training corpus  -into the sequence of letters that make it up.   - -10 -00:01:08,240 --> 00:01:14,000 -As you can see, there is a small subtlety:  -we add a 2 hashtags in front of the letters   - -11 -00:01:14,000 --> 00:01:20,240 -that do not start a word. By keeping  -only one occurrence per elementary unit   - -12 -00:01:20,240 --> 00:01:29,440 -we now have our initial vocabulary. We will  -list all the existing pairs in our corpus.   - -13 -00:01:30,800 --> 00:01:34,960 -Once we have this list, we will calculate  -a score for each of these pairs.   - -14 -00:01:36,400 --> 00:01:40,400 -As for the BPE algorithm, we will  -select the pair with the highest score.   - -15 -00:01:43,040 --> 00:01:50,000 -Taking for example the first pair composed  -of H and U. The score of a pair is simply   - -16 -00:01:50,000 --> 00:01:54,720 -equal to the frequency of appearance of  -the pair divided by the product of the   - -17 -00:01:54,720 --> 00:01:59,840 -frequency of appearance of the first token by  -the frequency of appearance of the second token.   - -18 -00:02:01,120 --> 00:02:04,560 -Thus at a fixed frequency  -of appearance of the pair,   - -19 -00:02:05,360 --> 00:02:11,440 -if the subparts of the pair are very frequent  -in the corpus then this score will be decreased.   - -20 -00:02:12,960 --> 00:02:24,000 -In our example, the pair "hu" appears 4 times, the  -letter "h" 4 times and the letter u 4 times. This   - -21 -00:02:24,000 --> 00:02:32,320 -gives us a score of 0.25. Now that we know how to  -calculate this score, we can do it for all pairs.   - -22 -00:02:33,200 --> 00:02:36,480 -We can now add to the vocabulary  -the pair with the highest score,   - -23 -00:02:37,120 --> 00:02:43,520 -after merging it of course! And now we can  -apply this same fusion to our split corpus.   - -24 -00:02:45,600 --> 00:02:51,520 -As you can imagine, we just have to repeat the  -same operations until we have the vocabulary at   - -25 -00:02:51,520 --> 00:03:00,320 -the desired size! Let's look at a few more steps  -to see the evolution of our vocabulary and the   - -26 -00:03:00,320 --> 00:03:09,840 -length of the splits getting shorter. 
Now that we  -are happy with our vocabulary, you are probably   - -27 -00:03:09,840 --> 00:03:16,400 -wondering how to use it to tokenize a text. Let's  -say we want to tokenize the word "huggingface".   - -28 -00:03:17,760 --> 00:03:23,280 -WordPiece follows these rules: We will look for  -the longest possible token at the beginning of   - -29 -00:03:23,280 --> 00:03:30,560 -our word. Then we start again on the remaining  -part of our word. And so on until we reach the   - -30 -00:03:30,560 --> 00:03:38,240 -end! And that's it, huggingface is divided  -into 4 sub-tokens. ßThis video is about to   - -31 -00:03:38,240 --> 00:03:43,040 -end, I hope it helped you to understand  -better what is behind the word WordPiece! +1 +00:00:00,151 --> 00:00:02,818 +(air whooshing) + +2 +00:00:05,520 --> 00:00:08,370 +- Let's see together what +is the training strategy + +3 +00:00:08,370 --> 00:00:11,851 +of the WordPiece algorithm, +and how it performs + +4 +00:00:11,851 --> 00:00:15,150 +the tokenization of a text, once trained. + +5 +00:00:19,351 --> 00:00:23,580 +WordPiece is a tokenization +algorithm introduced by Google. + +6 +00:00:23,580 --> 00:00:25,653 +It is used, for example, by BERT. + +7 +00:00:26,640 --> 00:00:28,020 +To our knowledge, + +8 +00:00:28,020 --> 00:00:31,590 +the code of WordPiece +has not been open source. + +9 +00:00:31,590 --> 00:00:33,510 +So we base our explanations + +10 +00:00:33,510 --> 00:00:36,903 +on our own interpretation +of the published literature. + +11 +00:00:42,090 --> 00:00:44,883 +So, what is the training +strategy of WordPiece? + +12 +00:00:46,200 --> 00:00:48,663 +Similarly to the BPE algorithm, + +13 +00:00:48,663 --> 00:00:52,380 +WordPiece starts by establishing +an initial vocabulary + +14 +00:00:52,380 --> 00:00:54,660 +composed of elementary units, + +15 +00:00:54,660 --> 00:00:58,773 +and then increases this +vocabulary to the desired size. + +16 +00:00:59,970 --> 00:01:01,950 +To build the initial vocabulary, + +17 +00:01:01,950 --> 00:01:04,920 +we divide each word in the training corpus + +18 +00:01:04,920 --> 00:01:07,443 +into the sequence of +letters that make it up. + +19 +00:01:08,430 --> 00:01:11,820 +As you can see, there is a small subtlety. + +20 +00:01:11,820 --> 00:01:14,190 +We add two hashtags in +front of the letters + +21 +00:01:14,190 --> 00:01:16,083 +that do not start a word. + +22 +00:01:17,190 --> 00:01:20,430 +By keeping only one occurrence +per elementary unit, + +23 +00:01:20,430 --> 00:01:23,313 +we now have our initial vocabulary. + +24 +00:01:26,580 --> 00:01:29,823 +We will list all the +existing pairs in our corpus. + +25 +00:01:30,990 --> 00:01:32,640 +Once we have this list, + +26 +00:01:32,640 --> 00:01:35,253 +we will calculate a score +for each of these pairs. + +27 +00:01:36,630 --> 00:01:38,400 +As for the BPE algorithm, + +28 +00:01:38,400 --> 00:01:40,750 +we will select the pair +with the highest score. + +29 +00:01:43,260 --> 00:01:44,340 +Taking for example, + +30 +00:01:44,340 --> 00:01:47,343 +the first pair composed +of the letters H and U. + +31 +00:01:48,510 --> 00:01:51,390 +The score of a pair is +simply equal to the frequency + +32 +00:01:51,390 --> 00:01:54,510 +of appearance of the pair, +divided by the product + +33 +00:01:54,510 --> 00:01:57,330 +of the frequency of +appearance of the first token, + +34 +00:01:57,330 --> 00:02:00,063 +by the frequency of appearance +of the second token. 
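A hedged sketch of the pair score just described, reproducing the 0.25 example with the assumed counts:

def pair_score(pair_freq, first_freq, second_freq):
    # Frequent subparts lower the score, so WordPiece favors merging pieces
    # that mostly appear together.
    return pair_freq / (first_freq * second_freq)

# The pair 'hu' appears 4 times, 'h' 4 times and 'u' 4 times in the toy corpus.
print(pair_score(4, 4, 4))   # 0.25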
+ +35 +00:02:01,260 --> 00:02:05,550 +Thus, at a fixed frequency +of appearance of the pair, + +36 +00:02:05,550 --> 00:02:09,913 +if the subparts of the pair are +very frequent in the corpus, + +37 +00:02:09,913 --> 00:02:11,823 +then this score will be decreased. + +38 +00:02:13,140 --> 00:02:17,460 +In our example, the pair +HU appears four times, + +39 +00:02:17,460 --> 00:02:22,460 +the letter H four times, +and the letter U four times. + +40 +00:02:24,030 --> 00:02:26,733 +This gives us a score of 0.25. + +41 +00:02:28,410 --> 00:02:30,960 +Now that we know how to +calculate this score, + +42 +00:02:30,960 --> 00:02:33,360 +we can do it for all pairs. + +43 +00:02:33,360 --> 00:02:35,217 +We can now add to the vocabulary + +44 +00:02:35,217 --> 00:02:38,973 +the pair with the highest score, +after merging it of course. + +45 +00:02:40,140 --> 00:02:43,863 +And now we can apply this same +fusion to our split corpus. + +46 +00:02:45,780 --> 00:02:47,490 +As you can imagine, + +47 +00:02:47,490 --> 00:02:50,130 +we just have to repeat the same operations + +48 +00:02:50,130 --> 00:02:53,013 +until we have the vocabulary +at the desired size. + +49 +00:02:54,000 --> 00:02:55,800 +Let's look at a few more steps + +50 +00:02:55,800 --> 00:02:58,113 +to see the evolution of our vocabulary, + +51 +00:02:58,957 --> 00:03:01,773 +and also the evolution of +the length of the splits. + +52 +00:03:06,390 --> 00:03:09,180 +And now that we are happy +with our vocabulary, + +53 +00:03:09,180 --> 00:03:12,663 +you are probably wondering how +to use it to tokenize a text. + +54 +00:03:13,830 --> 00:03:17,640 +Let's say we want to tokenize +the word "huggingface". + +55 +00:03:17,640 --> 00:03:20,310 +WordPiece follows these rules. + +56 +00:03:20,310 --> 00:03:22,530 +We will look for the +longest possible token + +57 +00:03:22,530 --> 00:03:24,960 +at the beginning of the word. + +58 +00:03:24,960 --> 00:03:28,920 +Then we start again on the +remaining part of our word, + +59 +00:03:28,920 --> 00:03:31,143 +and so on until we reach the end. + +60 +00:03:32,100 --> 00:03:35,973 +And that's it. Huggingface is +divided into four sub-tokens. + +61 +00:03:37,200 --> 00:03:39,180 +This video is about to end. + +62 +00:03:39,180 --> 00:03:41,370 +I hope it helped you to understand better + +63 +00:03:41,370 --> 00:03:43,653 +what is behind the work, WordPiece. + +64 +00:03:45,114 --> 00:03:47,864 +(air whooshing) + diff --git a/subtitles/en/53_unigram-tokenization.srt b/subtitles/en/53_unigram-tokenization.srt index 265b9eee1..cc6bae91e 100644 --- a/subtitles/en/53_unigram-tokenization.srt +++ b/subtitles/en/53_unigram-tokenization.srt @@ -1,444 +1,707 @@ -1 -00:00:05,330 --> 00:00:11,090 -In this video, we will study together "the -Unigram Language Model subword tokenization - -2 -00:00:11,090 --> 00:00:12,090 -algorithm". - -3 -00:00:12,090 --> 00:00:20,080 -The overall training strategy of a Unigram -LM tokenizer is to start with a very large - -4 -00:00:20,080 --> 00:00:27,439 -vocabulary and then to remove tokens at each -iteration until we reach the desired size. - -5 -00:00:27,439 --> 00:00:32,250 -At each iteration, we will calculate a loss -on our training corpus thanks to the Unigram - -6 -00:00:32,250 --> 00:00:33,250 -model. - -7 -00:00:33,250 --> 00:00:39,160 -As the loss calculation depends on the available -vocabulary, we can use it to choose how to - -8 -00:00:39,160 --> 00:00:41,590 -reduce the vocabulary. 
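Before continuing with Unigram, a hedged sketch of the WordPiece longest-match-first rule described above, with an assumed toy vocabulary rather than the one trained in the video:

# Assumed vocabulary; '##' marks pieces that do not start a word.
vocab = {"hugg", "##ing", "##fac", "##e", "h", "##u", "##g", "[UNK]"}

def wordpiece_tokenize(word):
    tokens = []
    while word:
        # Look for the longest piece of the remaining word that is in the vocabulary.
        end = len(word)
        while end > 0:
            piece = word[:end] if not tokens else "##" + word[:end]
            if piece in vocab:
                break
            end -= 1
        if end == 0:
            return ["[UNK]"]   # no known piece: the whole word is unknown
        tokens.append(piece)
        word = word[end:]
    return tokens

print(wordpiece_tokenize("huggingface"))   # ['hugg', '##ing', '##fac', '##e']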
- -9 -00:00:41,590 --> 00:00:48,090 -So we look at the evolution of the loss by -removing in turn each token from the vocabulary. - -10 -00:00:48,090 --> 00:00:56,730 -We will choose to remove the p percents which -increase the loss the less. - -11 -00:00:56,730 --> 00:01:01,030 -Before going further in the explanation of -the training algorithm, I need to explain - -12 -00:01:01,030 --> 00:01:04,199 -what is an Unigram model. - -13 -00:01:04,199 --> 00:01:08,119 -The Unigram LM model is a type of statistical -Language Modem. - -14 -00:01:08,119 --> 00:01:15,550 -A statistical LM will assign a probability -to a text considering that the text is in - -15 -00:01:15,550 --> 00:01:18,189 -fact a sequence of tokens. - -16 -00:01:18,189 --> 00:01:23,900 -The simplest sequences of tokens to imagine -are the words that compose the sentence or - -17 -00:01:23,900 --> 00:01:25,410 -the characters. - -18 -00:01:25,410 --> 00:01:32,080 -The particularity of Unigram LM is that it -assumes that the occurrence of each word is - -19 -00:01:32,080 --> 00:01:34,670 -independent of its previous word. - -20 -00:01:34,670 --> 00:01:40,271 -This "assumption" allows us to write that -the probability of a text is equal to the - -21 -00:01:40,271 --> 00:01:44,430 -product of the probabilities of the tokens -that compose it. - -22 -00:01:44,430 --> 00:01:51,880 -It should be noted here that this is a very -simple model which would not be adapted to - -23 -00:01:51,880 --> 00:01:58,630 -the generation of text since this model would -always generate the same token, the one which - -24 -00:01:58,630 --> 00:02:00,140 -has the greatest probability. - -25 -00:02:00,140 --> 00:02:07,409 -Nevertheless, to do tokenization, this model -is very useful to us because it can be used - -26 -00:02:07,409 --> 00:02:14,209 -to estimate the relative likelihood of different -phrases. - -27 -00:02:14,209 --> 00:02:20,000 -We are now ready to return to our explanation -of the training algorithm. - -28 -00:02:20,000 --> 00:02:25,349 -Let's say that we have as a training corpus -10 times the word hug, 12 times the word pug, - -29 -00:02:25,349 --> 00:02:33,270 -5 times the word lug, 4 times bug and 5 times -dug. - -30 -00:02:33,270 --> 00:02:38,910 -As said at the beginning of the video, the -training starts with a big vocabulary. - -31 -00:02:38,910 --> 00:02:45,280 -Obviously, as we are using a toy corpus, this -vocabulary will not be that big but it should - -32 -00:02:45,280 --> 00:02:46,840 -show you the principle. - -33 -00:02:46,840 --> 00:02:54,870 -A first method is to list all the possible -strict substrings that's what we'll do here. - -34 -00:02:54,870 --> 00:03:00,379 -We could also have used the BPE algorithm -with a very large vocabulary size. - -35 -00:03:00,379 --> 00:03:07,200 -So we have our initial vocabulary. - -36 -00:03:07,200 --> 00:03:13,629 -The training of the Unigram tokenizer is based -on the Expectation-Maximization method: at - -37 -00:03:13,629 --> 00:03:15,210 -each iteration. - -38 -00:03:15,210 --> 00:03:19,190 -We estimate the probabilities of the tokens -of the vocabulary. - -39 -00:03:19,190 --> 00:03:26,430 -Then we remove the p percent of tokens that -minimize the loss on the corpus and which - -40 -00:03:26,430 --> 00:03:33,500 -do not belong to the basic characters as we -want to keep in our final vocabulary the basic - -41 -00:03:33,500 --> 00:03:37,980 -characters to be able to tokenize any word. - -42 -00:03:37,980 --> 00:03:39,230 -Let's go for it! 
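A hedged sketch of the independence assumption described above: the probability of a tokenized text is the product of the probabilities of its tokens (the values are assumed):

# Assumed unigram probabilities for a few tokens of the vocabulary.
probs = {"h": 0.05, "u": 0.05, "g": 0.06, "hu": 0.03, "ug": 0.04, "hug": 0.02}

def tokenization_probability(tokens):
    # Unigram assumption: every token is independent of the tokens before it.
    probability = 1.0
    for token in tokens:
        probability *= probs[token]
    return probability

print(tokenization_probability(["hu", "g"]))   # 0.03 * 0.06
print(tokenization_probability(["hug"]))       # 0.02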
- -43 -00:03:39,230 --> 00:03:44,660 -The probability of a token is simply estimated -by the number of appearance of this token - -44 -00:03:44,660 --> 00:03:51,590 -in our training corpus divided by the total -number of appearance of all the tokens. - -45 -00:03:51,590 --> 00:03:57,239 -We could use this vocabulary to tokenize our -words according to the unigram model. - -46 -00:03:57,239 --> 00:04:04,080 -We will do it together to understand two things: -how we tokenize a word with a Unigram model - -47 -00:04:04,080 --> 00:04:09,160 -and how the loss is calculated on our corpus. - -48 -00:04:09,160 --> 00:04:14,610 -The Unigram LM tokenization of our text "Hug" -will be the one with the highest probability - -49 -00:04:14,610 --> 00:04:19,140 -of occurrence according to our Unigram model. - -50 -00:04:19,140 --> 00:04:24,090 -To find it, the simplest way to proceed would -be to list all the possible segmentations - -51 -00:04:24,090 --> 00:04:29,949 -of our text "Hug", calculate the probability -of each of these segmentations and then choose - -52 -00:04:29,949 --> 00:04:32,490 -the one with the highest probability. - -53 -00:04:32,490 --> 00:04:38,630 -With the current vocabulary, 2 tokenizations -get exactly the same probability. - -54 -00:04:38,630 --> 00:04:43,789 -So we choose one of them and keep in memory -the associated probability. - -55 -00:04:43,789 --> 00:04:48,850 -To compute the loss on our training corpus, -we need to tokenize as we just did all the - -56 -00:04:48,850 --> 00:04:52,810 -remaining words in the corpus. - -57 -00:04:52,810 --> 00:04:57,930 -The loss is then the sum over all the words -in the corpus of the frequency of occurrence - -58 -00:04:57,930 --> 00:05:04,220 -of the word multiplied by the opposite of -the log of the probability associated with - -59 -00:05:04,220 --> 00:05:07,720 -the tokenization of the word. - -60 -00:05:07,720 --> 00:05:12,700 -We obtain here a loss of one hundred and seventy. - -61 -00:05:12,700 --> 00:05:18,750 -Remember, our initial goal was to reduce the -vocabulary. - -62 -00:05:18,750 --> 00:05:27,810 -To do this, we will remove a token from the -vocabulary and calculate the associated loss. - -63 -00:05:27,810 --> 00:05:32,020 -Let's remove for example the token 'ug'. - -64 -00:05:32,020 --> 00:05:38,569 -We notice that the tokenization for "hug" -with the letter h and the tuple ug is now - -65 -00:05:38,569 --> 00:05:39,970 -impossible. - -66 -00:05:39,970 --> 00:05:45,810 -Nevertheless, as we saw earlier that two tokenizations -had the same probability and we can still - -67 -00:05:45,810 --> 00:05:50,870 -choose the remaining tokenization with a probability -of one point ten minus two. - -68 -00:05:50,870 --> 00:05:58,210 -The tokenizations of the other words of the -vocabulary also remain unchanged and finally - -69 -00:05:58,210 --> 00:06:06,710 -even if we remove the token "ug" from our -vocabulary the loss remains equal to 170. - -70 -00:06:06,710 --> 00:06:11,550 -For this first iteration, if we continue the -calculation, we would notice that we could - -71 -00:06:11,550 --> 00:06:16,190 -remove any token without it impacting the -loss. - -72 -00:06:16,190 --> 00:06:24,620 -We will therefore choose at random to remove -the token "ug" before starting a second iteration. - -73 -00:06:24,620 --> 00:06:29,600 -We estimate again the probability of each -token before calculating the impact of each - -74 -00:06:29,600 --> 00:06:32,280 -token on the loss. 
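A hedged sketch of the loss computation just described, with assumed word frequencies and assumed probabilities for each word's best tokenization:

import math

# Assumed: frequency of each corpus word and the probability of its best
# tokenization under the current Unigram model.
word_freqs = {"hug": 10, "pug": 12, "lug": 5, "bug": 4, "dug": 5}
best_probs = {"hug": 1e-2, "pug": 5e-3, "lug": 2e-3, "bug": 2e-3, "dug": 2e-3}

loss = sum(freq * -math.log(best_probs[word]) for word, freq in word_freqs.items())
print(loss)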
- -75 -00:06:32,280 --> 00:06:37,840 -For example, if we remove now the token composed -of the letters "h" and "u", there is only - -76 -00:06:37,840 --> 00:06:42,020 -one possible tokenization left for hug. - -77 -00:06:42,020 --> 00:06:46,580 -The tokenization of the other words of the -vocabulary is not changed. - -78 -00:06:46,580 --> 00:06:51,880 -In the end, we obtain by removing the token -composed of the letters "h" and "u" from the - -79 -00:06:51,880 --> 00:06:54,650 -vocabulary a loss of one hundred and sixty-eight. - -80 -00:06:54,650 --> 00:07:02,550 -Finally, to choose which token to remove, -we will for each remaining token of the vocabulary - -81 -00:07:02,550 --> 00:07:10,090 -which is not an elementary token calculate -the associated loss then compare these losses - -82 -00:07:10,090 --> 00:07:11,850 -between them. - -83 -00:07:11,850 --> 00:07:18,100 -The token which we will remove is the token -which impacts the least the loss: here the - -84 -00:07:18,100 --> 00:07:20,129 -token "bu". - -85 -00:07:20,129 --> 00:07:25,710 -We had mentioned at the beginning of the video -that at each iteration we could remove p % of - -86 -00:07:25,710 --> 00:07:29,540 -the tokens by iteration. - -87 -00:07:29,540 --> 00:07:35,850 -The second token that could be removed at -this iteration is the "du" token. - -88 -00:07:35,850 --> 00:07:42,690 -And that's it, we just have to repeat these -steps until we get the vocabulary of the desired - -89 -00:07:42,690 --> 00:07:45,240 -size. - -90 -00:07:45,240 --> 00:07:51,129 -One last thing, in practice, when we tokenize -a word with a Unigram model we don't compute - -91 -00:07:51,129 --> 00:07:57,210 -the set of probabilities of the possible splits -of a word before comparing them to keep the - -92 -00:07:57,210 --> 00:08:05,560 -best one but we use the Viterbi algorithm -which is much more efficient. - -93 -00:08:05,560 --> 00:08:07,300 -And that's it! - -94 -00:08:07,300 --> 00:08:15,000 -I hope that this example has allowed you to -better understand the Unigram tokenization - -95 -00:08:15,000 --> 00:08:18,190 -algorithm. +1 +00:00:00,000 --> 00:00:02,667 +(air whooshing) + +2 +00:00:05,310 --> 00:00:06,420 +- In this video, + +3 +00:00:06,420 --> 00:00:09,881 +we will study together +'the Unigram Language Model + +4 +00:00:09,881 --> 00:00:13,288 +subword tokenization algorithm'. + +5 +00:00:13,288 --> 00:00:15,567 +The overall training strategy + +6 +00:00:15,567 --> 00:00:18,450 +of a Unigram Language Model tokenizer + +7 +00:00:18,450 --> 00:00:21,480 +is to start with a very large vocabulary + +8 +00:00:21,480 --> 00:00:24,240 +and then to remove +tokens at each iteration + +9 +00:00:24,240 --> 00:00:27,300 +until we reach the desired size. + +10 +00:00:27,300 --> 00:00:28,530 +At each iteration, + +11 +00:00:28,530 --> 00:00:30,930 +we will calculate a loss +on our training corpus + +12 +00:00:30,930 --> 00:00:33,480 +thanks to the Unigram model. + +13 +00:00:33,480 --> 00:00:37,470 +As the loss calculation depends +on the available vocabulary, + +14 +00:00:37,470 --> 00:00:40,563 +we can use it to choose how +to reduce the vocabulary. + +15 +00:00:41,550 --> 00:00:43,620 +So we look at the evolution of the loss + +16 +00:00:43,620 --> 00:00:47,103 +by removing in turn each +token from the vocabulary. + +17 +00:00:48,000 --> 00:00:50,430 +We will choose to remove the p-percents + +18 +00:00:50,430 --> 00:00:52,200 +which increase the loss the less. 
+
+19
+00:00:56,310 --> 00:00:57,540
+Before going further
+
+20
+00:00:57,540 --> 00:01:00,240
+in the explanation of
+the training algorithm,
+
+21
+00:01:00,240 --> 00:01:02,973
+I need to explain what
+a Unigram model is.
+
+22
+00:01:04,183 --> 00:01:06,030
+The Unigram Language Model
+
+23
+00:01:06,030 --> 00:01:08,493
+is a type of Statistical Language Model.
+
+24
+00:01:09,450 --> 00:01:10,980
+A Statistical Language Model
+
+25
+00:01:10,980 --> 00:01:13,530
+will assign a probability to a text
+
+26
+00:01:13,530 --> 00:01:18,090
+considering that the text is
+in fact a sequence of tokens.
+
+27
+00:01:18,090 --> 00:01:21,090
+The simplest sequences
+of tokens to imagine
+
+28
+00:01:21,090 --> 00:01:24,753
+are the words that compose the
+sentence or the characters.
+
+29
+00:01:26,130 --> 00:01:28,890
+The particularity of
+the Unigram Language Model
+
+30
+00:01:28,890 --> 00:01:32,010
+is that it assumes that
+the occurrence of each word
+
+31
+00:01:32,010 --> 00:01:34,533
+is independent of its previous word.
+
+32
+00:01:35,400 --> 00:01:37,620
+This assumption allows us to write
+
+33
+00:01:37,620 --> 00:01:39,570
+that the probability of a text
+
+34
+00:01:39,570 --> 00:01:42,210
+is equal to the product
+of the probabilities
+
+35
+00:01:42,210 --> 00:01:43,953
+of the tokens that compose it.
+
+36
+00:01:45,840 --> 00:01:50,220
+It should be noted here that
+it is a very simple model
+
+37
+00:01:50,220 --> 00:01:53,850
+which would not be adapted
+to the generation of text
+
+38
+00:01:53,850 --> 00:01:57,840
+since this model would always
+generate the same token,
+
+39
+00:01:57,840 --> 00:02:00,453
+the one which has the
+greatest probability.
+
+40
+00:02:01,320 --> 00:02:03,360
+Nevertheless, to do tokenization,
+
+41
+00:02:03,360 --> 00:02:05,790
+this model is very useful to us
+
+42
+00:02:05,790 --> 00:02:07,440
+because it can be used
+
+43
+00:02:07,440 --> 00:02:10,893
+to estimate the relative
+likelihood of different phrases.
+
+44
+00:02:14,100 --> 00:02:15,000
+We are now ready
+
+45
+00:02:15,000 --> 00:02:19,830
+to return to our explanation
+of the training algorithm.
+
+46
+00:02:19,830 --> 00:02:21,690
+Let's say that we have
+a training corpus
+
+47
+00:02:21,690 --> 00:02:23,880
+with 10 times the word hug,
+
+48
+00:02:23,880 --> 00:02:25,410
+12 times the word pug,
+
+49
+00:02:25,410 --> 00:02:27,330
+5 times the word lug,
+
+50
+00:02:27,330 --> 00:02:28,560
+4 times bug
+
+51
+00:02:28,560 --> 00:02:29,943
+and 5 times dug.
+
+52
+00:02:33,120 --> 00:02:34,560
+As said earlier,
+
+53
+00:02:34,560 --> 00:02:37,473
+the training starts with a big vocabulary.
+
+54
+00:02:38,460 --> 00:02:41,400
+Obviously, as we are using a toy corpus,
+
+55
+00:02:41,400 --> 00:02:44,430
+this vocabulary will not be that big
+
+56
+00:02:44,430 --> 00:02:46,773
+but it should show you the principle.
+
+57
+00:02:47,610 --> 00:02:51,870
+A first method is to list all
+the possible strict substrings
+
+58
+00:02:51,870 --> 00:02:53,823
+and that's what we'll do here.
+
+59
+00:02:54,780 --> 00:02:58,170
+We could also have used the BPE algorithm
+
+60
+00:02:58,170 --> 00:03:00,010
+with a very large vocabulary size
+
+61
+00:03:01,410 --> 00:03:05,103
+but for now, the strict
+substrings are enough.
+
+62
+00:03:06,990 --> 00:03:09,120
+The training of the Unigram tokenizer
+
+63
+00:03:09,120 --> 00:03:12,093
+is based on the
+Expectation-Maximization method.
+
+64
+00:03:13,320 --> 00:03:15,120
+At each iteration,
+
+65
+00:03:15,120 --> 00:03:17,430
+we estimate the
+probabilities of the tokens
+
+66
+00:03:17,430 --> 00:03:18,430
+of the vocabulary
+
+67
+00:03:20,130 --> 00:03:23,100
+and then we remove the p-percent of tokens
+
+68
+00:03:23,100 --> 00:03:26,070
+that impact the loss on the corpus the least
+
+69
+00:03:26,070 --> 00:03:28,900
+and which do not belong
+to the basic characters
+
+70
+00:03:29,880 --> 00:03:33,150
+as we want to keep in our final vocabulary
+
+71
+00:03:33,150 --> 00:03:36,693
+the basic characters to be
+able to tokenize any word.
+
+72
+00:03:37,770 --> 00:03:39,641
+Let's go for it!
+
+73
+00:03:39,641 --> 00:03:42,360
+The probability of a
+token is simply estimated
+
+74
+00:03:42,360 --> 00:03:44,760
+by the number of appearances of this token
+
+75
+00:03:44,760 --> 00:03:46,440
+in our training corpus
+
+76
+00:03:46,440 --> 00:03:50,133
+divided by the total number of
+appearances of all the tokens.
+
+77
+00:03:51,510 --> 00:03:54,390
+We could use this vocabulary
+to tokenize our words
+
+78
+00:03:54,390 --> 00:03:56,283
+according to the Unigram model.
+
+79
+00:03:57,150 --> 00:04:00,892
+We will do it together
+to understand two things:
+
+80
+00:04:00,892 --> 00:04:04,110
+how we tokenize a word
+with a Unigram model
+
+81
+00:04:04,110 --> 00:04:07,803
+and how the loss is
+calculated on our corpus.
+
+82
+00:04:09,088 --> 00:04:12,263
+The Unigram LM tokenization
+of our text 'Hug'
+
+83
+00:04:12,263 --> 00:04:15,270
+will be the one with the highest
+probability of occurrence
+
+84
+00:04:15,270 --> 00:04:17,403
+according to our Unigram model.
+
+85
+00:04:19,080 --> 00:04:21,750
+To find it, the simplest way to proceed
+
+86
+00:04:21,750 --> 00:04:24,120
+would be to list all the
+possible segmentations
+
+87
+00:04:24,120 --> 00:04:25,800
+of our text 'Hug',
+
+88
+00:04:25,800 --> 00:04:29,340
+calculate the probability of
+each of these segmentations
+
+89
+00:04:29,340 --> 00:04:32,043
+and then choose the one with
+the highest probability.
+
+90
+00:04:33,210 --> 00:04:34,920
+With the current vocabulary,
+
+91
+00:04:34,920 --> 00:04:38,640
+two tokenizations get
+exactly the same probability.
+
+92
+00:04:38,640 --> 00:04:40,080
+So we choose one of them
+
+93
+00:04:40,080 --> 00:04:42,603
+and keep in memory the
+associated probability.
+
+94
+00:04:43,710 --> 00:04:46,380
+To compute the loss on
+our training corpus,
+
+95
+00:04:46,380 --> 00:04:48,570
+we need to tokenize, as we just did,
+
+96
+00:04:48,570 --> 00:04:50,673
+all the remaining words in the corpus.
+
+97
+00:04:52,290 --> 00:04:56,430
+The loss is then the sum over
+all the words in the corpus
+
+98
+00:04:56,430 --> 00:04:58,920
+of the frequency of occurrence of the word
+
+99
+00:04:58,920 --> 00:05:02,670
+multiplied by the opposite
+of the log of the probability
+
+100
+00:05:02,670 --> 00:05:05,463
+associated with the
+tokenization of the word.
+
+101
+00:05:07,620 --> 00:05:10,803
+We obtain here a loss of 170.
+
+102
+00:05:13,830 --> 00:05:18,630
+Remember, our initial goal
+was to reduce the vocabulary.
+
+103
+00:05:18,630 --> 00:05:21,870
+To do this, we will remove
+a token from the vocabulary
+
+104
+00:05:21,870 --> 00:05:24,213
+and calculate the associated loss.
+
+105
+00:05:27,630 --> 00:05:30,627
+Let's remove, for example, the token 'ug'.
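To make the word-tokenization and loss computation described above concrete, here is a small Python sketch that enumerates the segmentations of a word, scores each one under a unigram model, and sums the corpus loss as frequency times the negative log probability of the best segmentation. The token counts below are illustrative stand-ins rather than the exact numbers shown on the slides.

import math
from itertools import combinations

# Toy corpus: word -> frequency (same words as the video, illustrative setup)
corpus = {"hug": 10, "pug": 12, "lug": 5, "bug": 4, "dug": 5}

# Illustrative token counts used to estimate the unigram probabilities
token_counts = {"h": 10, "u": 36, "g": 36, "p": 12, "l": 5, "b": 4, "d": 5,
                "hu": 10, "ug": 36, "pu": 12, "lu": 5, "bu": 4, "du": 5}
total = sum(token_counts.values())
probs = {token: count / total for token, count in token_counts.items()}

def segmentations(word):
    """Yield every way of splitting `word` into contiguous substrings."""
    for n_cuts in range(len(word)):
        for cuts in combinations(range(1, len(word)), n_cuts):
            pieces, previous = [], 0
            for cut in (*cuts, len(word)):
                pieces.append(word[previous:cut])
                previous = cut
            yield pieces

def best_tokenization(word):
    """Return the segmentation with the highest probability under the unigram model."""
    best, best_prob = None, 0.0
    for pieces in segmentations(word):
        if all(piece in probs for piece in pieces):
            prob = math.prod(probs[piece] for piece in pieces)
            if prob > best_prob:
                best, best_prob = pieces, prob
    return best, best_prob

# Corpus loss: sum over words of frequency * -log(probability of the kept tokenization)
loss = sum(freq * -math.log(best_tokenization(word)[1]) for word, freq in corpus.items())
print(loss)

Removing a candidate token from `probs`, re-running this loss, and comparing it to the original value is exactly the comparison the rest of the video walks through.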
+
+106
+00:05:31,920 --> 00:05:35,370
+We notice that the tokenization for 'hug'
+
+107
+00:05:35,370 --> 00:05:39,990
+with the letter 'h' and the
+tuple 'ug' is now impossible.
+
+108
+00:05:39,990 --> 00:05:42,240
+Nevertheless, as we saw earlier
+
+109
+00:05:42,240 --> 00:05:45,180
+two tokenizations
+had the same probability,
+
+110
+00:05:45,180 --> 00:05:47,730
+so we can still choose the
+remaining tokenization
+
+111
+00:05:47,730 --> 00:05:51,093
+with a probability of 1.10e-2.
+
+112
+00:05:52,410 --> 00:05:55,350
+The tokenizations of the
+other words of the vocabulary
+
+113
+00:05:55,350 --> 00:05:57,060
+also remain unchanged.
+
+114
+00:05:57,060 --> 00:06:00,600
+And finally, even if we
+remove the token 'ug'
+
+115
+00:06:00,600 --> 00:06:05,403
+from our vocabulary, the
+loss remains equal to 170.
+
+116
+00:06:06,630 --> 00:06:08,100
+For this first iteration,
+
+117
+00:06:08,100 --> 00:06:10,080
+if we continue the calculation,
+
+118
+00:06:10,080 --> 00:06:13,050
+we would notice that we
+could remove any token
+
+119
+00:06:13,050 --> 00:06:16,110
+without it impacting the loss.
+
+120
+00:06:16,110 --> 00:06:19,200
+We will therefore choose at
+random to remove the token 'ug'
+
+121
+00:06:19,200 --> 00:06:21,843
+before starting a second iteration.
+
+122
+00:06:24,240 --> 00:06:27,300
+So we estimate again the
+probability of each token
+
+123
+00:06:27,300 --> 00:06:30,630
+before calculating the impact
+of each token on the loss.
+
+124
+00:06:32,160 --> 00:06:33,990
+For example, if we now remove
+
+125
+00:06:33,990 --> 00:06:36,290
+the token composed of
+the letters 'h' and 'u',
+
+126
+00:06:37,350 --> 00:06:41,013
+there is only one possible
+tokenization left for hug.
+
+127
+00:06:41,940 --> 00:06:44,700
+The tokenization of the
+other words of the vocabulary
+
+128
+00:06:44,700 --> 00:06:45,633
+is not changed.
+
+129
+00:06:46,560 --> 00:06:47,393
+In the end,
+
+130
+00:06:47,393 --> 00:06:49,200
+we obtain by removing the token
+
+131
+00:06:49,200 --> 00:06:52,749
+composed of the letters 'h'
+and 'u' from the vocabulary,
+
+132
+00:06:52,749 --> 00:06:56,430
+a loss of 168.
+
+133
+00:06:56,430 --> 00:06:59,490
+Finally, to choose which token to remove,
+
+134
+00:06:59,490 --> 00:07:02,490
+we will, for each remaining
+token of the vocabulary,
+
+135
+00:07:02,490 --> 00:07:04,800
+which is not an elementary token,
+
+136
+00:07:04,800 --> 00:07:07,380
+calculate the associated loss.
+
+137
+00:07:07,380 --> 00:07:09,843
+Then, compare these losses between them.
+
+138
+00:07:11,730 --> 00:07:13,800
+The token which we will remove
+
+139
+00:07:13,800 --> 00:07:17,340
+is the token which impacts
+the loss the least,
+
+140
+00:07:17,340 --> 00:07:18,870
+here the token 'bu'.
+
+141
+00:07:20,040 --> 00:07:22,380
+We had mentioned at the
+beginning of the video
+
+142
+00:07:22,380 --> 00:07:24,930
+that at each iteration we could remove
+
+143
+00:07:24,930 --> 00:07:27,093
+p-percent of the tokens.
+
+144
+00:07:29,356 --> 00:07:33,000
+The second token that could
+be removed at this iteration
+
+145
+00:07:33,000 --> 00:07:34,317
+is the token 'du'.
+
+146
+00:07:36,510 --> 00:07:37,920
+And that's it.
+
+147
+00:07:37,920 --> 00:07:39,720
+We just have to repeat these steps
+
+148
+00:07:39,720 --> 00:07:43,203
+until we get the vocabulary
+of the desired size.
+
+149
+00:07:45,030 --> 00:07:46,500
+One last thing.
+ +150 +00:07:46,500 --> 00:07:50,310 +In practice, when we tokenize +a word with a Unigram model, + +151 +00:07:50,310 --> 00:07:53,130 +we don't compute the +set of probabilities of + +152 +00:07:53,130 --> 00:07:55,500 +all the possible splits of a word + +153 +00:07:55,500 --> 00:07:58,770 +before comparing them to keep the best one + +154 +00:07:58,770 --> 00:08:01,440 +but we use the Viterbi algorithm + +155 +00:08:01,440 --> 00:08:04,563 +which is much more efficient way to do it. + +156 +00:08:06,540 --> 00:08:07,680 +And that's it! + +157 +00:08:07,680 --> 00:08:09,270 +I hope that this example + +158 +00:08:09,270 --> 00:08:10,987 +has allowed you to better understand + +159 +00:08:10,987 --> 00:08:12,933 +the Unigram tokenization algorithm. + +160 +00:08:14,355 --> 00:08:17,022 +(air whooshing) + diff --git a/subtitles/en/54_building-a-new-tokenizer.srt b/subtitles/en/54_building-a-new-tokenizer.srt index e38c6b749..73d6ff9c9 100644 --- a/subtitles/en/54_building-a-new-tokenizer.srt +++ b/subtitles/en/54_building-a-new-tokenizer.srt @@ -1,245 +1,396 @@ -1 -00:00:05,350 --> 00:00:11,360 -In this video we will see how you can create -your own tokenizer from scratch! - -2 -00:00:11,360 --> 00:00:18,370 -To create your own tokenizer you will have -to think about each of the operations involved - -3 -00:00:18,370 --> 00:00:25,220 -in tokenization, namely: normalization, pre-tokenization, -model, post-processing and decoding. - -4 -00:00:25,220 --> 00:00:32,310 -If you don't know what normalization, pre-tokenization -and the model are, I advise you to go and - -5 -00:00:32,310 --> 00:00:34,800 -see the videos linked below. - -6 -00:00:34,800 --> 00:00:40,329 -The post processing gathers all the modifications -that we will carry out on the tokenized text. - -7 -00:00:40,329 --> 00:00:46,690 -It can include the addition of special tokens, -the creation of an attention mask but also - -8 -00:00:46,690 --> 00:00:50,200 -the generation of a list of token ids. - -9 -00:00:50,200 --> 00:00:55,350 -The decoding operation occurs at the very -end and will allow passing from the sequence - -10 -00:00:55,350 --> 00:00:59,000 -of ids in a sentence. - -11 -00:00:59,000 --> 00:01:04,220 -For example, in our example, we can see that -the hashtags have been removed and the tokens - -12 -00:01:04,220 --> 00:01:10,820 -composing the word "today" have been grouped -together. - -13 -00:01:10,820 --> 00:01:17,472 -In a fast tokenizer, all these components -are gathered in the backend_tokenizer attribute. - -14 -00:01:17,472 --> 00:01:22,720 -As you can see with this small code snippet, -it is an instance of a tokenizer from the - -15 -00:01:22,720 --> 00:01:24,860 -tokenizers library. - -16 -00:01:24,860 --> 00:01:33,799 -So, to create your own transformers tokenizer -you will have to follow these steps: first - -17 -00:01:33,799 --> 00:01:40,510 -create a training dataset; second create and -train a tokenizer with the tokenizers library - -18 -00:01:40,510 --> 00:01:49,430 -and third load this tokenizer into transformers -tokenizer. - -19 -00:01:49,430 --> 00:01:56,510 -To understand these steps, I propose that -we recreate a BERT tokenizer. - -20 -00:01:56,510 --> 00:01:59,500 -The first thing to do is to create a dataset. - -21 -00:01:59,500 --> 00:02:05,650 -With this code snippet you can create an iterator -on the dataset wikitext-2-raw-v1 which is - -22 -00:02:05,650 --> 00:02:08,610 -a rather small dataset in English. 
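Since the video only names the Viterbi algorithm without showing it, here is a rough dynamic-programming sketch of the idea; `probs` is an illustrative unigram probability table and this is not the actual implementation inside the tokenizers library.

import math

def viterbi_tokenize(word, probs):
    """Best unigram segmentation of `word` by dynamic programming.

    best[i] is the lowest negative log probability of word[:i];
    back[i] remembers where the last token of that segmentation starts.
    """
    n = len(word)
    best = [0.0] + [math.inf] * n
    back = [0] * (n + 1)
    for end in range(1, n + 1):
        for start in range(end):
            piece = word[start:end]
            if piece in probs and best[start] != math.inf:
                score = best[start] - math.log(probs[piece])
                if score < best[end]:
                    best[end], back[end] = score, start
    # Follow the backpointers to recover the tokens
    tokens, position = [], n
    while position > 0:
        tokens.append(word[back[position]:position])
        position = back[position]
    return tokens[::-1]

# Illustrative probability table; in practice it comes from the trained unigram model
probs = {"h": 0.05, "u": 0.17, "g": 0.17, "hu": 0.05, "ug": 0.17}
print(viterbi_tokenize("hug", probs))

Instead of scoring every possible split, this single left-to-right pass only keeps the best way to reach each position of the word, which is what makes it much cheaper than full enumeration.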
- -23 -00:02:08,610 --> 00:02:18,830 -We attack here the big part: the design of -our tokenizer with the tokenizers library. - -24 -00:02:18,830 --> 00:02:25,349 -We start by initializing a tokenizer instance -with a WordPiece model because it is the model - -25 -00:02:25,349 --> 00:02:29,240 -used by BERT. - -26 -00:02:29,240 --> 00:02:32,110 -Then we can define our normalizer. - -27 -00:02:32,110 --> 00:02:39,930 -We will define it as a succession of 2 normalizations -used to clean up characters not visible in - -28 -00:02:39,930 --> 00:02:46,659 -the text, 1 lowercasing normalization and -2 normalizations used to remove accents. - -29 -00:02:46,659 --> 00:02:54,459 -For the pre-tokenization, we will chain two -pre_tokenizer. - -30 -00:02:54,459 --> 00:02:59,959 -The first one separating the text at the level -of spaces and the second one isolating the - -31 -00:02:59,959 --> 00:03:02,450 -punctuation marks. - -32 -00:03:02,450 --> 00:03:08,430 -Now, we can define the trainer that will allow -us to train the WordPiece model chosen at - -33 -00:03:08,430 --> 00:03:11,209 -the beginning. - -34 -00:03:11,209 --> 00:03:17,280 -To carry out the training, we will have to -choose a vocabulary size, here we choose twenty-five - -35 -00:03:17,280 --> 00:03:29,099 -thousand and also announce the special tokens -that we absolutely want to add to our vocabulary. - -36 -00:03:29,099 --> 00:03:39,209 -In one line of code, we can train our WordPiece -model using the iterator we defined earlier. - -37 -00:03:39,209 --> 00:03:45,800 -Once the model has been trained, we can retrieve -the ids of the special class and separation - -38 -00:03:45,800 --> 00:03:49,750 -tokens because we will need them to post-process -our sequence. - -39 -00:03:49,750 --> 00:03:55,790 -Thanks to the TemplateProcessing class, we -can add the CLS token at the beginning of - -40 -00:03:55,790 --> 00:04:01,780 -each sequence and the SEP token at the end -of the sequence and between two sentences - -41 -00:04:01,780 --> 00:04:07,060 -if we tokenize a text pair. - -42 -00:04:07,060 --> 00:04:12,099 -Finally, we just have to define our decoder -which will allow us to remove the hashtags - -43 -00:04:12,099 --> 00:04:17,810 -at the beginning of the tokens that must be -reattached to the previous token. - -44 -00:04:17,810 --> 00:04:30,930 -And there it ist, you have all the necessary -lines of code to define your own tokenizer. - -45 -00:04:30,930 --> 00:04:35,120 -Now that we have a brand new tokenizer with -the tokenizers library we just have to load - -46 -00:04:35,120 --> 00:04:40,070 -it into a fast tokenizer from the transformers -library. - -47 -00:04:40,070 --> 00:04:42,660 -Here again we have several possibilities. - -48 -00:04:42,660 --> 00:04:48,830 -We can load it in the generic class "PreTrainedTokenizerFast" -or in the BertTokenizerFast class since we - -49 -00:04:48,830 --> 00:04:56,380 -have built a bert type tokenizer here. - -50 -00:04:56,380 --> 00:05:01,600 -I hope this video has helped you understand -how you can create your own tokenizer and - -51 -00:05:01,600 --> 00:05:10,669 -that you are ready to navigate the tokenizers -library documentation to choose the components - -52 -00:05:10,669 --> 00:05:16,490 -for your brand-new tokenizer! +1 +00:00:00,188 --> 00:00:02,855 +(air whooshing) + +2 +00:00:05,400 --> 00:00:07,500 +In this video, we will see how + +3 +00:00:07,500 --> 00:00:11,310 +you can create your own +tokenizer from scratch. 
+
+4
+00:00:11,310 --> 00:00:15,000
+To create your own tokenizer,
+you will have to think about
+
+5
+00:00:15,000 --> 00:00:18,180
+each of the operations
+involved in tokenization.
+
+6
+00:00:18,180 --> 00:00:22,440
+Namely, the normalization,
+the pre-tokenization,
+
+7
+00:00:22,440 --> 00:00:25,233
+the model, the post
+processing, and the decoding.
+
+8
+00:00:26,100 --> 00:00:28,350
+If you don't know what normalization,
+
+9
+00:00:28,350 --> 00:00:30,900
+pre-tokenization, and the model are,
+
+10
+00:00:30,900 --> 00:00:34,531
+I advise you to go and see
+the videos linked below.
+
+11
+00:00:34,531 --> 00:00:37,110
+The post processing gathers
+all the modifications
+
+12
+00:00:37,110 --> 00:00:40,860
+that we will carry out
+on the tokenized text.
+
+13
+00:00:40,860 --> 00:00:43,890
+It can include the
+addition of special tokens,
+
+14
+00:00:43,890 --> 00:00:46,290
+the creation of an attention mask,
+
+15
+00:00:46,290 --> 00:00:48,903
+but also the generation
+of a list of token IDs.
+
+16
+00:00:50,220 --> 00:00:53,487
+The decoding operation
+occurs at the very end,
+
+17
+00:00:53,487 --> 00:00:54,660
+and will allow passing
+
+18
+00:00:54,660 --> 00:00:57,753
+from the sequence of IDs to a sentence.
+
+19
+00:00:58,890 --> 00:01:01,800
+For example, you can see that the hashtags
+
+20
+00:01:01,800 --> 00:01:04,260
+have been removed, and the tokens
+
+21
+00:01:04,260 --> 00:01:07,323
+composing the word today
+have been grouped together.
+
+22
+00:01:10,860 --> 00:01:13,440
+In a fast tokenizer, all these components
+
+23
+00:01:13,440 --> 00:01:16,413
+are gathered in the
+backend_tokenizer attribute.
+
+24
+00:01:17,370 --> 00:01:20,070
+As you can see with
+this small code snippet,
+
+25
+00:01:20,070 --> 00:01:22,020
+it is an instance of a tokenizer
+
+26
+00:01:22,020 --> 00:01:23,763
+from the tokenizers library.
+
+27
+00:01:25,740 --> 00:01:28,263
+So, to create your own tokenizer,
+
+28
+00:01:29,970 --> 00:01:31,770
+you will have to follow these steps.
+
+29
+00:01:33,270 --> 00:01:35,433
+First, create a training dataset.
+
+30
+00:01:36,690 --> 00:01:39,000
+Second, create and train a tokenizer
+
+31
+00:01:39,000 --> 00:01:41,700
+with the tokenizers library.
+
+32
+00:01:41,700 --> 00:01:46,700
+And third, load this tokenizer
+into a transformers tokenizer.
+
+33
+00:01:49,350 --> 00:01:50,850
+To understand these steps,
+
+34
+00:01:50,850 --> 00:01:54,573
+I propose that we recreate
+a BERT tokenizer together.
+
+35
+00:01:56,460 --> 00:01:58,893
+The first thing to do
+is to create a dataset.
+
+36
+00:01:59,970 --> 00:02:02,460
+With this code snippet
+you can create an iterator
+
+37
+00:02:02,460 --> 00:02:05,430
+on the dataset wikitext-2-raw-v1,
+
+38
+00:02:05,430 --> 00:02:08,160
+which is a rather small
+dataset in English,
+
+39
+00:02:08,160 --> 00:02:09,730
+perfect for the example.
+
+40
+00:02:12,210 --> 00:02:13,920
+We attack here the big part,
+
+41
+00:02:13,920 --> 00:02:17,373
+the design of our tokenizer
+with the tokenizers library.
+
+42
+00:02:18,750 --> 00:02:22,020
+We start by initializing
+a tokenizer instance
+
+43
+00:02:22,020 --> 00:02:26,133
+with a WordPiece model because
+it is the model used by BERT.
+
+44
+00:02:29,100 --> 00:02:32,190
+Then we can define our normalizer.
+
+45
+00:02:32,190 --> 00:02:35,891
+We will define it as a
+succession of two normalizations
+
+46
+00:02:35,891 --> 00:02:39,453
+used to clean up characters
+not visible in the text.
+ +47 +00:02:40,590 --> 00:02:43,440 +One lowercasing normalization, + +48 +00:02:43,440 --> 00:02:47,253 +and two last normalizations +used to remove accents. + +49 +00:02:49,500 --> 00:02:53,553 +For the pre-tokenization, we +will chain two pre_tokenizers. + +50 +00:02:54,390 --> 00:02:58,200 +The first one separating the +text at the level of spaces, + +51 +00:02:58,200 --> 00:03:01,533 +and the second one isolating +the punctuation marks. + +52 +00:03:03,360 --> 00:03:06,360 +Now, we can define the +trainer that will allow us + +53 +00:03:06,360 --> 00:03:09,753 +to train the WordPiece model +chosen at the beginning. + +54 +00:03:11,160 --> 00:03:12,600 +To carry out the training, + +55 +00:03:12,600 --> 00:03:14,853 +we will have to choose a vocabulary size. + +56 +00:03:16,050 --> 00:03:17,910 +Here we choose 25,000. + +57 +00:03:17,910 --> 00:03:21,270 +And we also need to +announce the special tokens + +58 +00:03:21,270 --> 00:03:24,663 +that we absolutely want +to add to our vocabulary. + +59 +00:03:29,160 --> 00:03:33,000 +In one line of code, we can +train our WordPiece model + +60 +00:03:33,000 --> 00:03:35,553 +using the iterator we defined earlier. + +61 +00:03:39,060 --> 00:03:42,570 +Once the model has been +trained, we can retrieve + +62 +00:03:42,570 --> 00:03:46,560 +the IDs of the special +class and separation tokens, + +63 +00:03:46,560 --> 00:03:49,413 +because we will need them to +post-process our sequence. + +64 +00:03:50,820 --> 00:03:52,860 +Thanks to the TemplateProcessing class, + +65 +00:03:52,860 --> 00:03:57,210 +we can add the CLS token at +the beginning of each sequence, + +66 +00:03:57,210 --> 00:04:00,120 +and the SEP token at +the end of the sequence, + +67 +00:04:00,120 --> 00:04:03,873 +and between two sentences if +we tokenize a pair of text. + +68 +00:04:07,260 --> 00:04:10,500 +Finally, we just have +to define our decoder, + +69 +00:04:10,500 --> 00:04:12,690 +which will allow us to remove the hashtags + +70 +00:04:12,690 --> 00:04:14,610 +at the beginning of the tokens + +71 +00:04:14,610 --> 00:04:17,193 +that must be reattached +to the previous token. + +72 +00:04:21,300 --> 00:04:22,260 +And there it is. + +73 +00:04:22,260 --> 00:04:25,110 +You have all the necessary lines of code + +74 +00:04:25,110 --> 00:04:29,403 +to define your own tokenizer +with the tokenizer library. + +75 +00:04:30,960 --> 00:04:32,280 +Now that we have a brand new tokenizer + +76 +00:04:32,280 --> 00:04:35,400 +with the tokenizer library, +we just have to load it + +77 +00:04:35,400 --> 00:04:38,463 +into a fast tokenizer from +the transformers library. + +78 +00:04:39,960 --> 00:04:42,630 +Here again, we have several possibilities. + +79 +00:04:42,630 --> 00:04:44,430 +We can load it in the generic class, + +80 +00:04:44,430 --> 00:04:48,330 +PreTrainedTokenizerFast, or +in the BertTokenizerFast class + +81 +00:04:48,330 --> 00:04:52,353 +since we have built a +BERT like tokenizer here. + +82 +00:04:57,000 --> 00:04:59,670 +I really hope this video +has helped you understand + +83 +00:04:59,670 --> 00:05:02,133 +how you can create your own tokenizer, + +84 +00:05:03,178 --> 00:05:06,240 +and that you are ready now to navigate + +85 +00:05:06,240 --> 00:05:08,070 +the tokenizer library documentation + +86 +00:05:08,070 --> 00:05:11,367 +to choose the components for +your brand new tokenizer. 
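As a companion to the walkthrough above, here is a condensed sketch of those steps with the tokenizers library. The normalizers, pre-tokenizers, vocabulary size and special tokens follow what the video describes, but treat the exact choices as one possible configuration; `get_training_corpus` stands for the wikitext-2-raw-v1 iterator created earlier and is not defined here.

from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors, trainers
from transformers import PreTrainedTokenizerFast

# WordPiece model, since that is what BERT uses
tokenizer = Tokenizer(models.WordPiece(unk_token="[UNK]"))

# Normalization: clean up and lowercase the text, then strip accents (a BERT-like cleanup)
tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFD(), normalizers.Lowercase(), normalizers.StripAccents()]
)

# Pre-tokenization: split on whitespace, then isolate the punctuation marks
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
    [pre_tokenizers.WhitespaceSplit(), pre_tokenizers.Punctuation()]
)

# Trainer for the WordPiece model: vocabulary size and special tokens to keep
trainer = trainers.WordPieceTrainer(
    vocab_size=25000, special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
)
tokenizer.train_from_iterator(get_training_corpus(), trainer=trainer)  # your dataset iterator

# Post-processing: [CLS] at the start, [SEP] at the end and between the two sentences of a pair
cls_id = tokenizer.token_to_id("[CLS]")
sep_id = tokenizer.token_to_id("[SEP]")
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS]:0 $A:0 [SEP]:0",
    pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
    special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
)

# Decoder: reattach the ## continuation pieces to the previous token
tokenizer.decoder = decoders.WordPiece(prefix="##")

# Wrap it in a fast tokenizer usable everywhere in the transformers library
wrapped_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer, unk_token="[UNK]", pad_token="[PAD]",
    cls_token="[CLS]", sep_token="[SEP]", mask_token="[MASK]",
)

Loading the trained tokenizer into BertTokenizerFast instead of PreTrainedTokenizerFast, as mentioned in the video, works the same way since we built a BERT-like tokenizer here.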
+ +87 +00:05:12,674 --> 00:05:15,341 +(air whooshing) + diff --git a/subtitles/en/55_data-processing-for-token-classification.srt b/subtitles/en/55_data-processing-for-token-classification.srt index 797c6f3bc..4ddd9102d 100644 --- a/subtitles/en/55_data-processing-for-token-classification.srt +++ b/subtitles/en/55_data-processing-for-token-classification.srt @@ -1,174 +1,326 @@ -1 -00:00:05,600 --> 00:00:08,720 -Let's study how to preprocess a  -dataset for token classification!   - -2 -00:00:10,400 --> 00:00:15,840 -Token classification regroups any task that can  -be framed as labelling each word (or token) in   - -3 -00:00:15,840 --> 00:00:20,640 -a sentence, like identifying the persons,  -organizations and locations for instance.   - -4 -00:00:21,920 --> 00:00:26,720 -For our example, we will use the Conll  -dataset, in which we remove columns we   - -5 -00:00:26,720 --> 00:00:30,720 -won't use and rename the other ones to  -get to a dataset with just two columns:   - -6 -00:00:31,360 --> 00:00:37,280 -words and labels. If you have your own dataset  -for token classification, just make sure you   - -7 -00:00:37,280 --> 00:00:43,040 -clean your data to get to the same point, with  -one column containing words (as list of strings)   - -8 -00:00:43,040 --> 00:00:48,240 -and another containing labels (as integers  -spanning from to to your number of labels -1).()   - -9 -00:00:49,520 --> 00:00:53,520 -Make sure you have your label names stored  -somewhere - here we get them from the dataset   - -10 -00:00:53,520 --> 00:00:58,640 -features - so you are able to map the integers  -to some real labels when inspecting your data!   - -11 -00:01:00,480 --> 00:01:06,000 -Here we are doing named entity recognitions,  -so ours labels are either O for words that do   - -12 -00:01:06,000 --> 00:01:11,040 -not belong to any entity, LOC,  -for location, PER, for person,   - -13 -00:01:11,680 --> 00:01:19,200 -ORG for organization and MISC for miscellaneous.  -Each label has two versions: the B- labels   - -14 -00:01:19,200 --> 00:01:25,840 -indicate a word that begins an entity while the I-  -labels indicate a word that is inside an entity.   - -15 -00:01:26,880 --> 00:01:29,840 -The first step to preprocess our  -data is to tokenize the words.   - -16 -00:01:30,400 --> 00:01:35,200 -This is very easily done with a tokenizer, we just  -have to tell it we have pre-tokenized the data   - -17 -00:01:35,200 --> 00:01:42,160 -with the flag is_split_into_words. Then comes  -the hard part. Since we have added special tokens   - -18 -00:01:42,160 --> 00:01:47,200 -and each word may have been split into several  -tokens, our labels won't match the tokens anymore.   - -19 -00:01:47,840 --> 00:01:51,520 -This is where the word IDs our fast  -tokenizer provide come to the rescue.   - -20 -00:01:52,800 --> 00:01:57,440 -They match each token to the word it belongs to  -which allows us to map each token to its label.   - -21 -00:01:58,160 --> 00:02:02,080 -We just have to make sure we change the B-  -labels to their I- counterparts for tokens   - -22 -00:02:02,080 --> 00:02:08,880 -that are inside (but not at the beginning) of  -a word. The special tokens get a label of -100,   - -23 -00:02:08,880 --> 00:02:12,960 -which is how we tell the Transformer loss  -functions to ignore them when computing the loss.   
- -24 -00:02:14,560 --> 00:02:19,120 -The code is then pretty straightforward, we write  -a function that shifts the labels for tokens that   - -25 -00:02:19,120 --> 00:02:23,920 -are inside a word (that you can customize) and  -use it when generating the labels for each token.   - -26 -00:02:25,600 --> 00:02:29,840 -Once that function to create our labels is  -written, we can preprocess the whole dataset using   - -27 -00:02:29,840 --> 00:02:35,840 -the map function. With the option batched=True,  -we unleash the speed of out fast tokenizers.   - -28 -00:02:36,720 --> 00:02:39,360 -The last problem comes when  -we need to create a batch.   - -29 -00:02:40,160 --> 00:02:43,680 -Unless you changed the preprocessing  -function to apply some fixed padding,   - -30 -00:02:43,680 --> 00:02:49,280 -we will get sentences of various lengths, which we  -need to pad to the same length. The padding needs   - -31 -00:02:49,280 --> 00:02:55,280 -to be applied to the inputs as well as the labels,  -since we should have one label per token. Again,   - -32 -00:02:55,280 --> 00:03:01,200 --100 indicates the labels that should be ignored  -for the loss computation. This is all done for   - -33 -00:03:01,200 --> 00:03:05,760 -us by the DataCollatorForTokenClassification,  -which you can use in PyTorch or TensorFlow.   - -34 -00:03:06,400 --> 00:03:10,960 -With all of this, you are either ready to send  -your data and this data collator to the Trainer,   - -35 -00:03:10,960 --> 00:03:17,840 -or to use the to_tf_dataset method  -and use the fit method of your model. +1 +00:00:05,730 --> 00:00:07,590 +- Let's study how to preprocess a dataset + +2 +00:00:07,590 --> 00:00:09,063 +for token classification! + +3 +00:00:10,560 --> 00:00:12,660 +Token classification regroups any task + +4 +00:00:12,660 --> 00:00:14,940 +that can be framed as labeling each word + +5 +00:00:14,940 --> 00:00:17,190 +or token in a sentence, + +6 +00:00:17,190 --> 00:00:19,530 +like identifying the +persons, organizations + +7 +00:00:19,530 --> 00:00:21,093 +and locations for instance. + +8 +00:00:22,170 --> 00:00:25,470 +For our example, we will +use the Conll dataset, + +9 +00:00:25,470 --> 00:00:27,900 +in which we remove columns we won't use + +10 +00:00:27,900 --> 00:00:29,940 +and rename the other +ones to get to a dataset + +11 +00:00:29,940 --> 00:00:32,943 +with just two columns, words and labels. + +12 +00:00:34,200 --> 00:00:36,750 +If you have your own dataset +for token classification, + +13 +00:00:36,750 --> 00:00:39,870 +just make sure you clean your +data to get to the same point, + +14 +00:00:39,870 --> 00:00:43,290 +with one column containing +words as list of strings + +15 +00:00:43,290 --> 00:00:45,540 +and another containing labels as integers + +16 +00:00:45,540 --> 00:00:48,513 +spanning from zero to your +number of labels minus one. + +17 +00:00:49,740 --> 00:00:52,290 +Make sure you have your +label names stored somewhere. + +18 +00:00:52,290 --> 00:00:54,810 +Here we get them from +the dataset features. + +19 +00:00:54,810 --> 00:00:57,660 +So you are able to map the +integers to some real labels + +20 +00:00:57,660 --> 00:00:58,960 +when inspecting your data. + +21 +00:01:00,690 --> 00:01:03,510 +Here we are doing named +entity recognitions, + +22 +00:01:03,510 --> 00:01:05,430 +so ours labels are either O + +23 +00:01:05,430 --> 00:01:08,310 +for words that do not +belong to any entity. 
+ +24 +00:01:08,310 --> 00:01:13,310 +LOC for location, PER for +person, ORG for organization + +25 +00:01:13,860 --> 00:01:15,603 +and MISC for miscellaneous. + +26 +00:01:16,650 --> 00:01:18,540 +Each label has two versions. + +27 +00:01:18,540 --> 00:01:21,960 +The B labels indicate a +word that begins an entity + +28 +00:01:21,960 --> 00:01:25,503 +while the I labels indicate a +word that is inside an entity. + +29 +00:01:27,180 --> 00:01:28,830 +The first step to preprocess our data + +30 +00:01:28,830 --> 00:01:30,660 +is to tokenize the words. + +31 +00:01:30,660 --> 00:01:33,120 +This is very easily +done with the tokenizer. + +32 +00:01:33,120 --> 00:01:35,370 +We just have to tell it we +have pre-tokenized the data + +33 +00:01:35,370 --> 00:01:37,503 +with the flag is_split_into_words=True. + +34 +00:01:38,520 --> 00:01:40,380 +Then comes the hard part. + +35 +00:01:40,380 --> 00:01:42,360 +Since we have added special tokens + +36 +00:01:42,360 --> 00:01:45,270 +and each word may have been +split into several tokens, + +37 +00:01:45,270 --> 00:01:48,090 +our labels won't match the tokens anymore. + +38 +00:01:48,090 --> 00:01:50,670 +This is where the word IDs +our fast tokenizer provides + +39 +00:01:50,670 --> 00:01:51,723 +come to the rescue. + +40 +00:01:53,040 --> 00:01:55,500 +They match each token to +the word it belongs to + +41 +00:01:55,500 --> 00:01:58,470 +which allows us to map +each token to its label. + +42 +00:01:58,470 --> 00:01:59,303 +We just have to make sure + +43 +00:01:59,303 --> 00:02:01,710 +we change the B labels +to their I counterparts + +44 +00:02:01,710 --> 00:02:03,450 +for tokens that are inside + +45 +00:02:03,450 --> 00:02:05,433 +but not at the beginning of a word. + +46 +00:02:06,330 --> 00:02:09,120 +The special tokens get a label of -100, + +47 +00:02:09,120 --> 00:02:11,070 +which is how we tell the +Transformer loss functions + +48 +00:02:11,070 --> 00:02:14,607 +to ignore them when computing the loss. + +49 +00:02:14,607 --> 00:02:16,890 +The code is then pretty straightforward. + +50 +00:02:16,890 --> 00:02:18,660 +We write a function that shifts the labels + +51 +00:02:18,660 --> 00:02:21,840 +for tokens that are inside a +word that you can customize + +52 +00:02:21,840 --> 00:02:24,490 +and use it when generating +the labels for each token. + +53 +00:02:25,830 --> 00:02:28,260 +Once that function to create +our labels is written, + +54 +00:02:28,260 --> 00:02:31,920 +we can preprocess the whole +dataset using the map function. + +55 +00:02:31,920 --> 00:02:33,360 +With the option batched=True, + +56 +00:02:33,360 --> 00:02:35,793 +we unleash the speed +of out fast tokenizers. + +57 +00:02:37,110 --> 00:02:40,350 +The last problem comes when +we need to create a batch. + +58 +00:02:40,350 --> 00:02:42,150 +Unless you changed the +preprocessing function + +59 +00:02:42,150 --> 00:02:43,890 +to apply some fixed padding, + +60 +00:02:43,890 --> 00:02:45,900 +we will get sentences of various lengths, + +61 +00:02:45,900 --> 00:02:47,900 +which we need to pad to the same length. + +62 +00:02:48,930 --> 00:02:50,730 +The padding needs to be +applied to the inputs + +63 +00:02:50,730 --> 00:02:51,900 +as well as the labels, + +64 +00:02:51,900 --> 00:02:53,950 +since we should have one label per token. + +65 +00:02:54,870 --> 00:02:58,260 +Again, -100 indicates the +labels that should be ignored + +66 +00:02:58,260 --> 00:02:59,510 +for the loss computation. 
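Here is a minimal sketch of the label-alignment function described above, assuming a fast tokenizer and a dataset with `words` and `labels` columns as set up earlier; the B-to-I trick relies on B and I labels being adjacent integers (as in CoNLL-style label lists), so adapt it if your labels are ordered differently.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # any fast tokenizer works

def tokenize_and_align_labels(examples):
    tokenized = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    all_labels = []
    for i, word_labels in enumerate(examples["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        previous_word = None
        labels = []
        for word_id in word_ids:
            if word_id is None:             # special tokens ([CLS], [SEP], ...)
                labels.append(-100)
            elif word_id != previous_word:  # first token of a word keeps its label
                labels.append(word_labels[word_id])
            else:                           # token inside a word: turn B-XXX into I-XXX
                label = word_labels[word_id]
                labels.append(label + 1 if label % 2 == 1 else label)
            previous_word = word_id
        all_labels.append(labels)
    tokenized["labels"] = all_labels
    return tokenized

# tokenized_datasets = raw_datasets.map(tokenize_and_align_labels, batched=True)

The -100 values are the ones the data collator and the loss function will ignore, as explained above.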
+ +67 +00:03:00,420 --> 00:03:01,560 +This is all done for us + +68 +00:03:01,560 --> 00:03:04,050 +by the DataCollatorForTokenClassification, + +69 +00:03:04,050 --> 00:03:06,740 +which you can use in +PyTorch or TensorFlow. + +70 +00:03:06,740 --> 00:03:08,880 +With all of this, you are +either ready to send your data + +71 +00:03:08,880 --> 00:03:11,190 +and this data collator to the Trainer, + +72 +00:03:11,190 --> 00:03:13,320 +or use the to_tf_dataset method + +73 +00:03:13,320 --> 00:03:15,333 +and the fit method of your model. + diff --git a/subtitles/en/56_data-processing-for-masked-language-modeling.srt b/subtitles/en/56_data-processing-for-masked-language-modeling.srt index 30247046c..a411f55af 100644 --- a/subtitles/en/56_data-processing-for-masked-language-modeling.srt +++ b/subtitles/en/56_data-processing-for-masked-language-modeling.srt @@ -1,124 +1,249 @@ -1 -00:00:05,120 --> 00:00:11,360 -Let's see how we can preprocess our data  -for masked language modeling. As a reminder,   - -2 -00:00:11,360 --> 00:00:16,320 -masked language modeling is when a model  -needs to fill the blanks in a sentence.   - -3 -00:00:16,320 --> 00:00:22,400 -To do this, you just need texts, no labels,  -as this is a self-supervised problem. To apply   - -4 -00:00:22,400 --> 00:00:27,280 -this on your own data, just make sure you have all  -your texts gathered in one column of your dataset.   - -5 -00:00:28,160 --> 00:00:32,320 -Before we start randomly masking things, we  -will need to somehow make all those texts the   - -6 -00:00:32,320 --> 00:00:38,400 -same length to batch them together. The first  -way to make all the texts the same length is   - -7 -00:00:38,400 --> 00:00:43,840 -the one we used in text classification. let's  -pad the short texts and truncate the long ones.   - -8 -00:00:44,800 --> 00:00:48,400 -As we have seen when we processed  -data for text classification,   - -9 -00:00:48,400 --> 00:00:51,840 -this is all done by our tokenizer with the  -right options for padding and truncation.   - -10 -00:00:52,880 --> 00:00:57,840 -This will however make us lose a lot of texts  -if the examples in our dataset are very long,   - -11 -00:00:58,400 --> 00:01:03,040 -compared to the context length we picked.  -Here, all the portion in gray is lost.   - -12 -00:01:04,160 --> 00:01:08,320 -This is why a second way to generate samples  -of text with the same length is to chunk our   - -13 -00:01:08,320 --> 00:01:12,720 -text in pieces of context lengths, instead of  -discarding everything after the first chunk.   - -14 -00:01:13,760 --> 00:01:17,920 -There will probably be a remainder of length  -smaller than the context size, which we can   - -15 -00:01:17,920 --> 00:01:24,480 -choose to keep and pad or ignore. Here is how we  -can apply this in practice, by just adding the   - -16 -00:01:24,480 --> 00:01:30,080 -return overflowing tokens option in our tokenizer  -call. Note how this gives us a bigger dataset!   - -17 -00:01:31,280 --> 00:01:36,720 -This second way of chunking is ideal if all your  -texts are very long, but it won't work as nicely   - -18 -00:01:36,720 --> 00:01:42,640 -if you have a variety of lengths in the texts.  -In this case, the best option is to concatenate   - -19 -00:01:42,640 --> 00:01:47,600 -all your tokenized texts in one big stream, with  -a special tokens to indicate when you pass from   - -20 -00:01:47,600 --> 00:01:54,560 -one document to the other, and only then split the  -big stream into chunks. 
Here is how it can be done   - -21 -00:01:54,560 --> 00:02:01,200 -with code, with one loop to concatenate all the  -texts and another one to chunk it. Notice how it   - -22 -00:02:01,200 --> 00:02:05,920 -reduces the number of samples in our dataset here,  -there must have been quite a few short entries!   - -23 -00:02:07,520 --> 00:02:12,960 -Once this is done, the masking is the easy part.  -There is a data collator designed specifically for   - -24 -00:02:12,960 --> 00:02:18,240 -this in the Transformers library. You can use  -it directly in the Trainer, or when converting   - -25 -00:02:18,240 --> 00:02:29,600 -your datasets to tensorflow datasets before  -doing Keras.fit, with the to_tf_dataset method. +1 +00:00:00,000 --> 00:00:02,333 +(whooshing) + +2 +00:00:05,250 --> 00:00:07,230 +- Let's see how we can preprocess our data + +3 +00:00:07,230 --> 00:00:08,703 +for masked language modeling. + +4 +00:00:10,230 --> 00:00:12,570 +As a reminder, masked language modeling + +5 +00:00:12,570 --> 00:00:15,333 +is when a model needs to fill +the blanks in a sentence. + +6 +00:00:16,530 --> 00:00:19,650 +To do this, you just +need texts, no labels, + +7 +00:00:19,650 --> 00:00:22,200 +as this is a self-supervised problem. + +8 +00:00:22,200 --> 00:00:23,670 +To apply this on your own data, + +9 +00:00:23,670 --> 00:00:25,740 +just make sure you have +all your texts gathered + +10 +00:00:25,740 --> 00:00:27,603 +in one column of your dataset. + +11 +00:00:28,440 --> 00:00:30,480 +Before we start randomly masking things, + +12 +00:00:30,480 --> 00:00:33,090 +we will need to somehow make +all those texts the same length + +13 +00:00:33,090 --> 00:00:34,263 +to batch them together. + +14 +00:00:35,640 --> 00:00:38,490 +The first way to make all +the texts the same length + +15 +00:00:38,490 --> 00:00:40,590 +is the one we used in text classification. + +16 +00:00:41,430 --> 00:00:44,163 +Let's pad the short texts +and truncate the long ones. + +17 +00:00:45,030 --> 00:00:45,900 +As we have seen + +18 +00:00:45,900 --> 00:00:48,690 +when we processed data +for text classification, + +19 +00:00:48,690 --> 00:00:49,923 +this is all done by our tokenizer + +20 +00:00:49,923 --> 00:00:53,130 +with the right options for +padding and truncation. + +21 +00:00:53,130 --> 00:00:56,100 +This will however make +us lose a lot of texts + +22 +00:00:56,100 --> 00:00:58,620 +if the examples in our +dataset are very long, + +23 +00:00:58,620 --> 00:01:00,960 +compared to the context length we picked. + +24 +00:01:00,960 --> 00:01:03,393 +Here, all the portion in gray is lost. + +25 +00:01:04,410 --> 00:01:06,660 +This is why a second way +to generate samples of text + +26 +00:01:06,660 --> 00:01:08,820 +with the same length is to chunk our text + +27 +00:01:08,820 --> 00:01:10,560 +in pieces of context lengths, + +28 +00:01:10,560 --> 00:01:14,010 +instead of discarding everything +after the first chunk. + +29 +00:01:14,010 --> 00:01:15,420 +There will probably be a remainder + +30 +00:01:15,420 --> 00:01:17,700 +of length smaller than the context size, + +31 +00:01:17,700 --> 00:01:20,493 +which we can choose to +keep and pad or ignore. + +32 +00:01:21,570 --> 00:01:23,790 +Here is how we can apply this in practice, + +33 +00:01:23,790 --> 00:01:26,460 +by just adding the return +overflowing tokens option + +34 +00:01:26,460 --> 00:01:28,200 +in our tokenizer call. + +35 +00:01:28,200 --> 00:01:30,243 +Note how this gives us a bigger dataset! 
+ +36 +00:01:31,560 --> 00:01:34,260 +This second way of chunking +is ideal if all your texts + +37 +00:01:34,260 --> 00:01:36,270 +are very long, but it won't work + +38 +00:01:36,270 --> 00:01:39,900 +as nicely if you have a variety +of lengths in the texts. + +39 +00:01:39,900 --> 00:01:41,040 +In this case, + +40 +00:01:41,040 --> 00:01:44,280 +the best option is to concatenate +all your tokenized texts + +41 +00:01:44,280 --> 00:01:46,560 +in one big stream, with a special tokens + +42 +00:01:46,560 --> 00:01:49,800 +to indicate when you pass from +one document to the other, + +43 +00:01:49,800 --> 00:01:52,503 +and only then split the +big stream into chunks. + +44 +00:01:53,760 --> 00:01:55,620 +Here is how it can be done with code, + +45 +00:01:55,620 --> 00:01:58,230 +with one loop to concatenate all the texts + +46 +00:01:58,230 --> 00:01:59,673 +and another one to chunk it. + +47 +00:02:00,780 --> 00:02:02,850 +Notice how it reduces +the number of samples + +48 +00:02:02,850 --> 00:02:04,230 +in our dataset here, + +49 +00:02:04,230 --> 00:02:06,580 +there must have been +quite a few short entries! + +50 +00:02:07,710 --> 00:02:11,130 +Once this is done, the +masking is the easy part. + +51 +00:02:11,130 --> 00:02:13,400 +There is a data collator +designed specifically for this + +52 +00:02:13,400 --> 00:02:15,540 +in the Transformers library. + +53 +00:02:15,540 --> 00:02:17,700 +You can use it directly in the Trainer, + +54 +00:02:17,700 --> 00:02:20,400 +or when converting your +datasets to tensorflow datasets + +55 +00:02:20,400 --> 00:02:23,703 +before doing Keras.fit, with +the to_tf_dataset method. + +56 +00:02:24,992 --> 00:02:27,325 +(whooshing) + diff --git a/subtitles/en/57_what-is-perplexity.srt b/subtitles/en/57_what-is-perplexity.srt index b7fb95c7c..cdc14eb17 100644 --- a/subtitles/en/57_what-is-perplexity.srt +++ b/subtitles/en/57_what-is-perplexity.srt @@ -1,99 +1,231 @@ -1 -00:00:05,280 --> 00:00:09,200 -In this video we take a look at the  -mysterious sounding metric called Perplexity.   - -2 -00:00:10,880 --> 00:00:14,880 -You might have encountered perplexity  -when reading about generative models.   - -3 -00:00:14,880 --> 00:00:19,760 -You can see two examples here from the original  -transformer paper “Attention is all you need”   - -4 -00:00:19,760 --> 00:00:25,600 -as well as the more recent GPT-2 paper. Perplexity  -is a common metric to measure the performance   - -5 -00:00:25,600 --> 00:00:30,880 -of language models. The smaller the value the  -better the performance. But what does it actually   - -6 -00:00:30,880 --> 00:00:36,880 -mean and how can we calculate it? A very common  -quantity in machine learning is the likelihood.   - -7 -00:00:37,440 --> 00:00:41,280 -We can calculate the likelihood as the  -product of each token’s probability   - -8 -00:00:42,160 --> 00:00:47,200 -What this means is that for each token we use  -the language model to predict its probability   - -9 -00:00:47,200 --> 00:00:52,960 -based on the previous tokens. In the end we  -multiply all probabilities to get the Likelihood.   - -10 -00:00:55,680 --> 00:00:59,120 -With the likelihood we can calculate  -another important quantity:   - -11 -00:00:59,120 --> 00:01:04,560 -the cross entropy. You might already have heard  -about cross-entropy when looking at loss function.   - -12 -00:01:05,440 --> 00:01:08,480 -Cross-entropy is often used as a  -loss function in classification.   
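For reference, the concatenate-and-chunk approach plus the masking collator described in this video could look roughly like this; the checkpoint, column name, chunk size and masking probability are illustrative choices.

from transformers import AutoTokenizer, DataCollatorForLanguageModeling

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
chunk_size = 128  # the "context length" picked for the chunks

def group_texts(examples):
    # Concatenate all tokenized texts of the batch into one big stream...
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = (len(concatenated["input_ids"]) // chunk_size) * chunk_size
    # ...then split that stream into chunks of chunk_size (dropping the small remainder)
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

# tokenized = dataset.map(lambda x: tokenizer(x["text"]), batched=True, remove_columns=["text"])
# lm_dataset = tokenized.map(group_texts, batched=True)

# The collator takes care of the random masking at batch time
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

The same collator can be passed to the Trainer or used when converting the dataset with to_tf_dataset, as mentioned at the end of the video.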
- -13 -00:01:09,040 --> 00:01:14,720 -In language modeling we predict the next  -token which also is a classification task.   - -14 -00:01:15,600 --> 00:01:20,400 -Therefore, if we want to calculate the cross  -entropy of an example we can simply pass it to the   - -15 -00:01:20,400 --> 00:01:25,840 -model with the inputs as labels. The loss return  -by the model then corresponds the cross entropy.   - -16 -00:01:28,880 --> 00:01:32,640 -We are now only a single operation  -away from calculating the perplexity.   - -17 -00:01:33,280 --> 00:01:39,360 -By exponentiating the cross-entropy we get the  -perplexity. So you see that the perplexity is   - -18 -00:01:39,360 --> 00:01:55,040 -closely related to the loss. Keep in mind that  -the loss is only a weak proxy for a model’s   - -19 -00:01:55,040 --> 00:02:01,600 -ability to generate quality text and the same is  -true for perplexity. For this reason one usually   - -20 -00:02:01,600 --> 00:02:07,840 -also calculates more sophisticated metrics  -such as BLEU or ROUGE on generative tasks. +1 +00:00:00,095 --> 00:00:01,582 +(screen whooshing) + +2 +00:00:01,582 --> 00:00:02,659 +(sticker popping) + +3 +00:00:02,659 --> 00:00:05,379 +(screen whooshing) + +4 +00:00:05,379 --> 00:00:06,720 +- In this video, we take a look + +5 +00:00:06,720 --> 00:00:09,483 +at the mysterious sounding +metric called perplexity. + +6 +00:00:11,070 --> 00:00:12,630 +You might have encountered perplexity + +7 +00:00:12,630 --> 00:00:14,970 +when reading about generative models. + +8 +00:00:14,970 --> 00:00:16,680 +You can see two examples here, + +9 +00:00:16,680 --> 00:00:18,577 +one from the original transformer paper, + +10 +00:00:18,577 --> 00:00:19,950 +"Attention is all you need," + +11 +00:00:19,950 --> 00:00:23,340 +and the other one from the +more recent GPT-2 paper. + +12 +00:00:23,340 --> 00:00:25,740 +Perplexity is a common metric +to measure the performance + +13 +00:00:25,740 --> 00:00:27,150 +of language models. + +14 +00:00:27,150 --> 00:00:30,000 +The smaller its value, the +better the performance. + +15 +00:00:30,000 --> 00:00:32,950 +But what does it actually mean +and how can we calculate it? + +16 +00:00:34,440 --> 00:00:36,180 +A very common quantity in machine learning + +17 +00:00:36,180 --> 00:00:37,650 +is the likelihood. + +18 +00:00:37,650 --> 00:00:39,240 +We can calculate the likelihood + +19 +00:00:39,240 --> 00:00:42,390 +as the product of each +token's probability. + +20 +00:00:42,390 --> 00:00:44,730 +What this means is that for each token, + +21 +00:00:44,730 --> 00:00:47,340 +we use the language model +to predict its probability + +22 +00:00:47,340 --> 00:00:49,560 +based on the previous tokens. + +23 +00:00:49,560 --> 00:00:52,050 +In the end, we multiply all probabilities + +24 +00:00:52,050 --> 00:00:53,253 +to get the likelihood. + +25 +00:00:55,892 --> 00:00:57,000 +With the likelihood, + +26 +00:00:57,000 --> 00:00:59,340 +we can calculate another +important quantity, + +27 +00:00:59,340 --> 00:01:01,200 +the cross-entropy. + +28 +00:01:01,200 --> 00:01:03,450 +You might have already +heard about cross-entropy + +29 +00:01:03,450 --> 00:01:05,670 +when looking at loss function. + +30 +00:01:05,670 --> 00:01:09,210 +It is often used as a loss +function in classification. + +31 +00:01:09,210 --> 00:01:11,610 +In language modeling, we +predict the next token + +32 +00:01:11,610 --> 00:01:12,930 +based on the previous token, + +33 +00:01:12,930 --> 00:01:15,810 +which is also a classification task. 
+
+34
+00:01:15,810 --> 00:01:17,340
+Therefore, if we want to calculate
+
+35
+00:01:17,340 --> 00:01:19,290
+the cross-entropy of an example,
+
+36
+00:01:19,290 --> 00:01:21,090
+we can simply pass it to the model
+
+37
+00:01:21,090 --> 00:01:23,580
+with its inputs as labels.
+
+38
+00:01:23,580 --> 00:01:26,433
+The loss then corresponds
+to the cross-entropy.
+
+39
+00:01:29,130 --> 00:01:31,110
+We are now only a single operation away
+
+40
+00:01:31,110 --> 00:01:33,510
+from calculating the perplexity.
+
+41
+00:01:33,510 --> 00:01:37,710
+By exponentiating the cross-entropy,
+we get the perplexity.
+
+42
+00:01:37,710 --> 00:01:40,260
+So you see that the
+perplexity is closely related
+
+43
+00:01:40,260 --> 00:01:41,163
+to the loss.
+
+44
+00:01:42,060 --> 00:01:43,380
+Plugging in previous results
+
+45
+00:01:43,380 --> 00:01:47,010
+shows that this is
+equivalent to exponentiating
+
+46
+00:01:47,010 --> 00:01:51,033
+the negative average log
+probability of each token.
+
+47
+00:01:52,050 --> 00:01:54,630
+Keep in mind that the
+loss is only a weak proxy
+
+48
+00:01:54,630 --> 00:01:57,360
+for a model's ability
+to generate quality text
+
+49
+00:01:57,360 --> 00:02:00,510
+and the same is true for perplexity.
+
+50
+00:02:00,510 --> 00:02:02,550
+For this reason, one
+usually also calculates
+
+51
+00:02:02,550 --> 00:02:03,840
+more sophisticated metrics
+
+52
+00:02:03,840 --> 00:02:07,413
+such as BLEU or ROUGE on generative tasks.
+
+53
+00:02:08,551 --> 00:02:11,468
+(screen whooshing)
+
diff --git a/subtitles/en/58_what-is-domain-adaptation.srt b/subtitles/en/58_what-is-domain-adaptation.srt
index a83e76cb9..5d4f8c9d4 100644
--- a/subtitles/en/58_what-is-domain-adaptation.srt
+++ b/subtitles/en/58_what-is-domain-adaptation.srt
@@ -1,89 +1,185 @@
-1
-00:00:05,840 --> 00:00:12,400
-What is domain adaptation? When fine-tuning  
-a pretrained model on a new dataset,   
-
-2
-00:00:12,400 --> 00:00:17,200
-the fine-tuned model we obtain will make  
-predictions that are attuned to this new dataset.   
-
-3
-00:00:18,640 --> 00:00:23,440
-When the two models are trained with the same  
-task, we can then compare their predictions   
-
-4
-00:00:23,440 --> 00:00:27,600
-on the same input. The predictions  
-of the two models will be different,   
-
-5
-00:00:27,600 --> 00:00:32,640
-in a way that reflects the differences  
-between the two datasets, a phenomenon we call   
-
-6
-00:00:32,640 --> 00:00:39,840
-domain adaptation. Let's look at an example with  
-mask language modeling, by comparing the outputs   
-
-7
-00:00:39,840 --> 00:00:44,400
-of the pretrained distilBERT model with the  
-version fine-tuned in chapter 7 of the course   
-
-8
-00:00:44,400 --> 00:00:50,800
-(linked below). The pretrained model makes generic  
-predictions, whereas the fine-tuned model has its   
-
-9
-00:00:50,800 --> 00:00:57,040
-first two predictions linked to cinema. Since  
-it was fine-tuned on a movie reviews dataset,   
-
-10
-00:00:57,040 --> 00:01:00,320
-it's perfectly normal to see it  
-adapted its suggestions like this.   
-
-11
-00:01:01,200 --> 00:01:05,520
-Notice how it keeps the same predictions as  
-the pretrained model afterward. Even if the   
-
-12
-00:01:05,520 --> 00:01:09,920
-fine-tuned model adapts to the new dataset,  
-it's not forgetting what it was pretrained on.   
-
-13
-00:01:11,200 --> 00:01:17,120
-This is another example on a translation task.  
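In code, going from the loss to the perplexity is a single exponential, as described above; the checkpoint and sentence below are only placeholders.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # any causal language model would do
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

inputs = tokenizer("Hugging Face is based in New York City.", return_tensors="pt")
with torch.no_grad():
    # Passing the input ids as labels makes the model return the cross-entropy loss
    outputs = model(**inputs, labels=inputs["input_ids"])

cross_entropy = outputs.loss           # average negative log probability per token
perplexity = torch.exp(cross_entropy)  # exponentiate it to get the perplexity
print(cross_entropy.item(), perplexity.item())

A lower perplexity means the model assigned a higher average probability to the tokens of the text, which is why smaller values indicate better language-modeling performance.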
-On top we use a pretrained French/English model   - -14 -00:01:17,120 --> 00:01:22,720 -and at the bottom, the version we fine-tuned in  -chapter 7. The top model is pretrained on lots of   - -15 -00:01:22,720 --> 00:01:27,440 -texts, and leaves technical English terms like  -plugin and email unchanged in the translation   - -16 -00:01:28,160 --> 00:01:33,360 -(both are perfectly understood by French people).  -The dataset picked for the fine-tuning is a   - -17 -00:01:33,360 --> 00:01:38,240 -dataset of technical texts where special attention  -was picked to translate everything in French.   - -18 -00:01:38,960 --> 00:01:50,560 -As a result, the fine-tuned model picked that  -habit and translated both plugin and email. +1 +00:00:00,000 --> 00:00:01,402 +(air whooshing) + +2 +00:00:01,402 --> 00:00:02,720 +(smiley snapping) + +3 +00:00:02,720 --> 00:00:05,910 +(air whooshing) + +4 +00:00:05,910 --> 00:00:07,923 +- What is domain adaptation? + +5 +00:00:09,540 --> 00:00:12,540 +When fine-tuning a pre-trained +model on a new dataset, + +6 +00:00:12,540 --> 00:00:15,480 +the fine-tuned model we +obtain will make predictions + +7 +00:00:15,480 --> 00:00:17,433 +that are attuned to this new dataset. + +8 +00:00:18,840 --> 00:00:21,840 +When the two models are +trained with the same task, + +9 +00:00:21,840 --> 00:00:25,320 +we can then compare their +predictions on the same input. + +10 +00:00:25,320 --> 00:00:27,870 +The predictions of the two +models will be different + +11 +00:00:27,870 --> 00:00:29,790 +in a way that reflects the differences + +12 +00:00:29,790 --> 00:00:31,680 +between the two datasets, + +13 +00:00:31,680 --> 00:00:34,053 +a phenomenon we call domain adaptation. + +14 +00:00:35,310 --> 00:00:38,640 +Let's look at an example +with masked language modeling + +15 +00:00:38,640 --> 00:00:41,910 +by comparing the outputs of the +pre-trained DistilBERT model + +16 +00:00:41,910 --> 00:00:43,080 +with the version fine-tuned + +17 +00:00:43,080 --> 00:00:45,273 +in chapter 7 of the course, linked below. + +18 +00:00:46,500 --> 00:00:49,140 +The pre-trained model +makes generic predictions, + +19 +00:00:49,140 --> 00:00:50,580 +whereas the fine-tuned model + +20 +00:00:50,580 --> 00:00:53,253 +has its first two +predictions linked to cinema. + +21 +00:00:54,390 --> 00:00:57,210 +Since it was fine-tuned on +a movie reviews dataset, + +22 +00:00:57,210 --> 00:00:58,680 +it's perfectly normal to see + +23 +00:00:58,680 --> 00:01:01,440 +it adapted its suggestions like this. + +24 +00:01:01,440 --> 00:01:03,090 +Notice how it keeps the same prediction + +25 +00:01:03,090 --> 00:01:05,220 +as the pre-trained model afterward. + +26 +00:01:05,220 --> 00:01:08,100 +Even if the fine-tuned model +adapts to the new dataset, + +27 +00:01:08,100 --> 00:01:10,450 +it's not forgetting what +it was pre-trained on. + +28 +00:01:11,490 --> 00:01:14,220 +This is another example +on a translation task. + +29 +00:01:14,220 --> 00:01:17,310 +On top, we use a pre-trained +French/English model, + +30 +00:01:17,310 --> 00:01:21,330 +and at the bottom, the version +we fine-tuned in chapter 7. + +31 +00:01:21,330 --> 00:01:23,610 +The top model is pre-trained +on lots of texts, + +32 +00:01:23,610 --> 00:01:25,170 +and leaves technical English terms, + +33 +00:01:25,170 --> 00:01:28,350 +like plugin and email, +unchanged in the translation. + +34 +00:01:28,350 --> 00:01:31,350 +Both are perfectly +understood by French people. 
+ +35 +00:01:31,350 --> 00:01:33,780 +The dataset picked for the +fine-tuning is a dataset + +36 +00:01:33,780 --> 00:01:36,660 +of technical texts where +special attention was picked + +37 +00:01:36,660 --> 00:01:39,150 +on translating everything in French. + +38 +00:01:39,150 --> 00:01:42,090 +As a result, the fine-tuned +model picked that habit + +39 +00:01:42,090 --> 00:01:44,193 +and translated both plugin and email. + +40 +00:01:45,942 --> 00:01:49,181 +(air whooshing) + +41 +00:01:49,181 --> 00:01:50,592 +(air whooshing) + diff --git a/subtitles/en/59_data-processing-for-translation.srt b/subtitles/en/59_data-processing-for-translation.srt index c5d8a99bd..aaddd1f56 100644 --- a/subtitles/en/59_data-processing-for-translation.srt +++ b/subtitles/en/59_data-processing-for-translation.srt @@ -1,158 +1,247 @@ -1 -00:00:05,670 --> 00:00:09,630 -Let's see how to preprocess a dataset for -translation. - -2 -00:00:09,630 --> 00:00:13,269 -This is the task of well translating a sentence -in another language. - -3 -00:00:13,269 --> 00:00:18,110 -This video will focus on how to preprocess -your dataset once you have managed to put - -4 -00:00:18,110 --> 00:00:23,090 -it in the following format: one column for -the input texts, and one for the target texts. - -5 -00:00:23,090 --> 00:00:28,439 -Here is how we can achieve this with the Datasets -library on the KDE4 dataset for English to - -6 -00:00:28,439 --> 00:00:30,960 -French translation. - -7 -00:00:30,960 --> 00:00:35,360 -As long as you manage to have your data look -like this, you should be able to follow the - -8 -00:00:35,360 --> 00:00:36,769 -same steps. - -9 -00:00:36,769 --> 00:00:41,550 -For once, our labels are not integers corresponding -to some classes, but plain text. - -10 -00:00:41,550 --> 00:00:44,760 -We will thus need to tokenize them, like our -inputs. - -11 -00:00:44,760 --> 00:00:50,820 -There is a trap there though, as if you tokenize -your targets like your inputs, you will hit - -12 -00:00:50,820 --> 00:00:51,820 -a problem. - -13 -00:00:51,820 --> 00:00:55,829 -Even if you don't speak French, you might -notice some weird things in the tokenization - -14 -00:00:55,829 --> 00:01:01,800 -of the targets: most of the words are tokenized -in several subtokens, while "fish", one of - -15 -00:01:01,800 --> 00:01:05,799 -the only English word, is tokenized as a single -word. - -16 -00:01:05,799 --> 00:01:09,760 -That's because our inputs have been tokenized -as English. - -17 -00:01:09,760 --> 00:01:13,939 -Since our model knows two languages, you have -to warn it when tokenizing the targets, so - -18 -00:01:13,939 --> 00:01:16,360 -it switches in French mode. - -19 -00:01:16,360 --> 00:01:20,090 -This is done with the as_target_tokenizer -context manager. - -20 -00:01:20,090 --> 00:01:24,900 -You can see how it results in a more compact -tokenization. - -21 -00:01:24,900 --> 00:01:28,509 -Processing the whole dataset is then super -easy with the map function. - -22 -00:01:28,509 --> 00:01:32,900 -You can pick different maximum lengths for -the input and targets, and choose to pad at - -23 -00:01:32,900 --> 00:01:37,210 -this stage to that maximum length by setting -padding=max_length. - -24 -00:01:37,210 --> 00:01:42,540 -Here we will show you how to pad dynamically -as it requires one more step. - -25 -00:01:42,540 --> 00:01:45,560 -Your inputs and targets are all sentence of -various lengths. 
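One quick way to observe this domain shift yourself is to run the same masked sentence through both checkpoints with the fill-mask pipeline; the second checkpoint name is assumed to be the one fine-tuned on movie reviews in chapter 7 of the course, so substitute your own if it differs.

from transformers import pipeline

text = "This is a great [MASK]."

pretrained = pipeline("fill-mask", model="distilbert-base-uncased")
# Replace with the checkpoint you fine-tuned on the movie-review dataset in chapter 7
fine_tuned = pipeline("fill-mask", model="huggingface-course/distilbert-base-uncased-finetuned-imdb")

for name, fill_mask in [("pretrained", pretrained), ("fine-tuned", fine_tuned)]:
    print(name, [prediction["token_str"] for prediction in fill_mask(text)])

The fine-tuned checkpoint should surface movie-related words near the top while still agreeing with the pretrained model on the more generic suggestions, which is the behavior discussed above.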
- -26 -00:01:45,560 --> 00:01:50,470 -We will pad the inputs and targets separately -as the maximum length of the inputs and targets - -27 -00:01:50,470 --> 00:01:52,740 -might be different. - -28 -00:01:52,740 --> 00:01:57,259 -Then we pad the inputs with the pad token -and the targets with the -100 index, to make - -29 -00:01:57,259 --> 00:02:01,470 -sure they are not taken into account in the -loss computation. - -30 -00:02:01,470 --> 00:02:04,869 -Once this is done, batching inputs and targets -become super easy! - -31 -00:02:04,869 --> 00:02:10,220 -The Transformers library provides us with -a data collator to do this all automatically. - -32 -00:02:10,220 --> 00:02:15,920 -You can then pass it to the Trainer with your -datasets, or use it in the to_tf_dataset method - -33 -00:02:15,920 --> 00:02:17,410 -before using model.fit(). +1 +00:00:00,449 --> 00:00:01,559 +(air whooshing) + +2 +00:00:01,559 --> 00:00:02,767 +(logo popping) + +3 +00:00:02,767 --> 00:00:05,670 +(metal sliding) + +4 +00:00:05,670 --> 00:00:08,470 +- Let's see how to preprocess +a dataset for translation. + +5 +00:00:09,540 --> 00:00:12,420 +This is a task of well +translating a sentence + +6 +00:00:12,420 --> 00:00:14,310 +in another language. + +7 +00:00:14,310 --> 00:00:17,100 +This video will focus on how +to preprocess your dataset + +8 +00:00:17,100 --> 00:00:19,950 +once you've managed to put +it in the following format. + +9 +00:00:19,950 --> 00:00:23,730 +One column for input texts +and one for the target texts. + +10 +00:00:23,730 --> 00:00:25,980 +Here is how we can achieve +this with the Datasets library + +11 +00:00:25,980 --> 00:00:29,643 +and the KDE4 dataset for +English to French translation. + +12 +00:00:30,870 --> 00:00:33,240 +As long as you manage to have +your data look like this, + +13 +00:00:33,240 --> 00:00:35,440 +you should be able to +follow the same steps. + +14 +00:00:36,630 --> 00:00:39,210 +For once, our labels are not integers + +15 +00:00:39,210 --> 00:00:42,210 +corresponding to some +classes, but plain texts. + +16 +00:00:42,210 --> 00:00:45,810 +We will thus need to tokenize +them, like our inputs. + +17 +00:00:45,810 --> 00:00:47,370 +There is a trap there though, + +18 +00:00:47,370 --> 00:00:49,890 +as if you tokenize your +targets like your inputs, + +19 +00:00:49,890 --> 00:00:51,690 +you will hit a problem. + +20 +00:00:51,690 --> 00:00:54,090 +Even if you don't speak +French, you might notice + +21 +00:00:54,090 --> 00:00:57,270 +some weird things in the +tokenization of the targets. + +22 +00:00:57,270 --> 00:01:00,510 +Most of the words are +tokenized in several subtokens, + +23 +00:01:00,510 --> 00:01:03,180 +while fish, one of the only English word, + +24 +00:01:03,180 --> 00:01:05,670 +is tokenized as a single word. + +25 +00:01:05,670 --> 00:01:08,703 +That's because our inputs have +been tokenized as English. + +26 +00:01:09,660 --> 00:01:11,430 +Since our model knows two languages, + +27 +00:01:11,430 --> 00:01:13,800 +you have to warn it when +tokenizing the targets + +28 +00:01:13,800 --> 00:01:16,230 +so it switches in French mode. + +29 +00:01:16,230 --> 00:01:20,010 +This is done with the +as_target_tokenizer context manager. + +30 +00:01:20,010 --> 00:01:23,343 +You can see how it results in +a more compact tokenization. + +31 +00:01:24,810 --> 00:01:25,890 +Processing the whole dataset + +32 +00:01:25,890 --> 00:01:28,440 +is then super easy with the map function. 
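
A minimal sketch of the preprocessing described in these subtitles, assuming the KDE4 dataset mentioned here and an illustrative Marian English-to-French checkpoint; the checkpoint name and maximum lengths are assumptions:

from datasets import load_dataset
from transformers import AutoTokenizer

raw_datasets = load_dataset("kde4", lang1="en", lang2="fr")
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr")

max_input_length, max_target_length = 128, 128

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    # Tokenize the inputs in "English mode"
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Switch the tokenizer to "French mode" for the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)
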
+ +33 +00:01:28,440 --> 00:01:30,207 +You can pick different maximum lengths + +34 +00:01:30,207 --> 00:01:32,100 +for the inputs and targets, + +35 +00:01:32,100 --> 00:01:34,530 +and choose to pad at this +stage to that maximum length + +36 +00:01:34,530 --> 00:01:36,273 +by setting padding=max_length. + +37 +00:01:37,230 --> 00:01:39,300 +Here we'll show you to pad dynamically + +38 +00:01:39,300 --> 00:01:41,013 +as it requires one more step. + +39 +00:01:42,450 --> 00:01:43,470 +Your inputs and targets + +40 +00:01:43,470 --> 00:01:46,080 +are all sentences of various lengths. + +41 +00:01:46,080 --> 00:01:48,510 +We will pad the inputs +and targets separately, + +42 +00:01:48,510 --> 00:01:50,460 +as the maximum lengths +of the inputs and targets + +43 +00:01:50,460 --> 00:01:51,483 +might be different. + +44 +00:01:52,620 --> 00:01:54,540 +Then we pad the inputs with the pad token + +45 +00:01:54,540 --> 00:01:57,060 +and the targets with the -100 index + +46 +00:01:57,060 --> 00:01:58,890 +to make sure they're +not taken into account + +47 +00:01:58,890 --> 00:02:00,123 +in the loss computation. + +48 +00:02:01,320 --> 00:02:02,153 +Once this is done, + +49 +00:02:02,153 --> 00:02:04,340 +batching inputs and +targets become super easy. + +50 +00:02:05,670 --> 00:02:08,220 +The Transformers library +provides us with data collator + +51 +00:02:08,220 --> 00:02:10,500 +to do this all automatically. + +52 +00:02:10,500 --> 00:02:13,800 +You can then pass it to the +Trainer with your datasets + +53 +00:02:13,800 --> 00:02:15,960 +or use it in the to_tf_dataset method + +54 +00:02:15,960 --> 00:02:18,560 +before using model.fit() +on your (indistinct) model. + +55 +00:02:21,057 --> 00:02:23,724 +(air whooshing) + diff --git a/subtitles/en/60_what-is-the-bleu-metric.srt b/subtitles/en/60_what-is-the-bleu-metric.srt index ac7da0bd6..6231d1ca5 100644 --- a/subtitles/en/60_what-is-the-bleu-metric.srt +++ b/subtitles/en/60_what-is-the-bleu-metric.srt @@ -1,274 +1,540 @@ -1 -00:00:05,520 --> 00:00:12,080 -What is the BLEU metric? For many NLP tasks we  -can use common metrics like accuracy or F1 score,   - -2 -00:00:12,080 --> 00:00:15,280 -but what do you do when you want to measure  -the quality of text that's generated from a   - -3 -00:00:15,280 --> 00:00:19,680 -model like GPT-2? In this video, we'll take a look  -at a widely used metric for machine translation   - -4 -00:00:19,680 --> 00:00:22,960 -called BLEU, which is short for BiLingual  -Evaluation Understudy. The basic idea behind   - -5 -00:00:22,960 --> 00:00:27,280 -BLEU is to assign a single numerical score to  -a translation that tells us how "good" it is   - -6 -00:00:27,280 --> 00:00:32,080 -compared to one or more reference translations.  -In this example we have a sentence in Spanish that   - -7 -00:00:32,080 --> 00:00:37,280 -has been translated into English by some model.  -If we compare the generated translation to some   - -8 -00:00:37,280 --> 00:00:42,160 -reference human translations, we can see that the  -model is pretty good, but has made a common error:   - -9 -00:00:42,960 --> 00:00:48,000 -the Spanish word "tengo" means "have" in English  -and this 1-1 translation is not quite natural.   - -10 -00:00:49,680 --> 00:00:53,280 -So how can we measure the quality of a  -generated translation in an automatic way?   - -11 -00:00:54,080 --> 00:00:58,000 -The approach that BLEU takes is to compare the  -n-grams of the generated translation to the   - -12 -00:00:58,000 --> 00:01:03,760 -n-grams of the references. 
An n-gram is just  -a fancy way of saying "a chunk of n words",   - -13 -00:01:03,760 --> 00:01:07,280 -so let's start with unigrams, which correspond  -to the individual words in a sentence.   - -14 -00:01:08,720 --> 00:01:12,160 -In this example you can see that four of  -the words in the generated translation   - -15 -00:01:12,160 --> 00:01:18,000 -are also found in one of the reference  -translations. Now that we've found our matches,   - -16 -00:01:18,000 --> 00:01:21,920 -one way to assign a score to the translation  -is to compute the precision of the unigrams.   - -17 -00:01:22,880 --> 00:01:27,200 -This means we just count the number of matching  -words in the generated and reference translations   - -18 -00:01:27,200 --> 00:01:30,400 -and normalize the count by dividing by  -the number of word in the generation.   - -19 -00:01:31,600 --> 00:01:35,600 -In this example, we found 4 matching  -words and our generation has 5 words,   - -20 -00:01:36,960 --> 00:01:40,320 -so our unigram precision is 4/5 or 0.8. In general  -precision ranges from 0 to 1, and higher precision   - -21 -00:01:40,320 --> 00:01:48,160 -scores mean a better translation. One problem  -with unigram precision is that translation models   - -22 -00:01:48,160 --> 00:01:51,840 -sometimes get stuck in repetitive patterns  -and repeat the same word several times.   - -23 -00:01:52,960 --> 00:01:56,240 -If we just count the number of word matches,  -we can get really high precision scores   - -24 -00:01:56,240 --> 00:01:58,720 -even though the translation is  -terrible from a human perspective!   - -25 -00:01:59,840 --> 00:02:04,640 -For example, if our model just generates the word  -"six", we get a perfect unigram precision score.   - -26 -00:02:07,040 --> 00:02:12,000 -To handle this, BLEU uses a modified precision  -that clips the number of times to count a word,   - -27 -00:02:12,000 --> 00:02:14,960 -based on the maximum number of times it  -appears in the reference translation.   - -28 -00:02:16,160 --> 00:02:19,360 -In this example, the word "six"  -only appears once in the reference,   - -29 -00:02:19,360 --> 00:02:23,840 -so we clip the numerator to one and the modified  -unigram precision now gives a much lower score.   - -30 -00:02:27,440 --> 00:02:31,600 -Another problem with unigram precision is that  -it doesn't take into account the order of the   - -31 -00:02:31,600 --> 00:02:37,200 -words in the translations. For example, suppose  -we had Yoda translate our Spanish sentence,   - -32 -00:02:37,200 --> 00:02:43,120 -then we might get something backwards like  -"years six thirty have I". In this case,   - -33 -00:02:43,120 --> 00:02:46,560 -the modified unigram precision gives a  -high precision which is not what we want.   - -34 -00:02:48,240 --> 00:02:52,400 -So to deal with word ordering problems, BLEU  -actually computes the precision for several   - -35 -00:02:52,400 --> 00:02:57,360 -different n-grams and then averages the result.  -For example, if we compare 4-grams, then we can   - -36 -00:02:57,360 --> 00:03:03,840 -see there are no matching chunks of 4 words in  -translations and so the 4-gram precision is 0.   - -37 -00:03:05,440 --> 00:03:10,880 -To compute BLEU scores in Hugging Face Datasets is  -very simple: just use the load_metric() function,   - -38 -00:03:10,880 --> 00:03:13,840 -provide your model's predictions along  -with the references and you're good to go!   - -39 -00:03:16,240 --> 00:03:19,920 -The output contains several fields  -of interest. 
The precisions field   - -40 -00:03:19,920 --> 00:03:22,800 -contains all the individual  -precision scores for each n-gram.   - -41 -00:03:24,800 --> 00:03:30,320 -The BLEU score itself is then calculated by taking  -the geometric mean of the precision scores. By   - -42 -00:03:30,320 --> 00:03:34,880 -default, the mean of all four n-gram precisions is  -reported, a metric that is sometimes also called   - -43 -00:03:34,880 --> 00:03:40,480 -BLEU-4. In this example we can see the BLEU score  -is zero because the 4-gram precision was zero.   - -44 -00:03:43,440 --> 00:03:46,640 -The BLEU metric has some nice properties,  -but it is far from a perfect metric.   - -45 -00:03:47,280 --> 00:03:51,520 -The good properties are that it's easy to compute  -and widely used in research so you can compare   - -46 -00:03:51,520 --> 00:03:56,560 -your model against others on a benchmark. On the  -other hand, there are several problems with BLEU,   - -47 -00:03:56,560 --> 00:04:00,560 -including the fact it doesn't incorporate  -semantics and struggles on non-English languages.   - -48 -00:04:01,680 --> 00:04:04,560 -Another problem with BLEU is that it  -assumes the human translations have   - -49 -00:04:04,560 --> 00:04:08,400 -already been tokenized and this makes it hard  -to compare models with different tokenizers.   - -50 -00:04:11,200 --> 00:04:15,280 -Measuring the quality of texts is still a  -difficult, open problem in NLP research.   - -51 -00:04:15,280 --> 00:04:17,680 -For machine translation, the  -current recommendation is to   - -52 -00:04:17,680 --> 00:04:21,600 -use the SacreBLEU metric which addresses  -the tokenization limitations of BLEU.   - -53 -00:04:22,640 --> 00:04:26,560 -As you can see in this example, computing  -the SacreBLEU score is almost identical to   - -54 -00:04:26,560 --> 00:04:30,800 -the BLEU one. The main difference is that we  -now pass a list of texts instead of a list   - -55 -00:04:30,800 --> 00:04:41,200 -of words for the translations, and SacreBLEU  -takes care of the tokenization under the hood. +1 +00:00:00,147 --> 00:00:01,412 +(screen whooshing) + +2 +00:00:01,412 --> 00:00:02,698 +(sticker popping) + +3 +00:00:02,698 --> 00:00:05,670 +(screen whooshing) + +4 +00:00:05,670 --> 00:00:07,650 +- What is the BLEU metric? + +5 +00:00:07,650 --> 00:00:10,170 +For many NLP tasks we +can use common metrics + +6 +00:00:10,170 --> 00:00:12,810 +like accuracy or F1 +score, but what do you do + +7 +00:00:12,810 --> 00:00:14,340 +when you wanna measure the quality of text + +8 +00:00:14,340 --> 00:00:16,560 +that's been translated from a model? + +9 +00:00:16,560 --> 00:00:18,750 +In this video, we'll take a +look at a widely used metric + +10 +00:00:18,750 --> 00:00:20,613 +for machine translation called BLEU. + +11 +00:00:22,290 --> 00:00:23,940 +The basic idea behind BLEU is to assign + +12 +00:00:23,940 --> 00:00:26,250 +a single numerical score to a translation + +13 +00:00:26,250 --> 00:00:27,450 +that tells us how good it is + +14 +00:00:27,450 --> 00:00:30,199 +compared to one or more +reference translations. + +15 +00:00:30,199 --> 00:00:32,130 +In this example, we have +a sentence in Spanish + +16 +00:00:32,130 --> 00:00:35,340 +that has been translated +into English by some model. 
+ +17 +00:00:35,340 --> 00:00:37,170 +If we compare the generated translation + +18 +00:00:37,170 --> 00:00:39,150 +to some reference human translations, + +19 +00:00:39,150 --> 00:00:41,190 +we can see that the model +is actually pretty good, + +20 +00:00:41,190 --> 00:00:43,260 +but has made a common error. + +21 +00:00:43,260 --> 00:00:46,050 +The Spanish word tengo +means have in English, + +22 +00:00:46,050 --> 00:00:48,700 +and this one-to-one translation +is not quite natural. + +23 +00:00:49,890 --> 00:00:51,270 +So how can we measure the quality + +24 +00:00:51,270 --> 00:00:54,270 +of a generated translation +in some automatic way? + +25 +00:00:54,270 --> 00:00:56,730 +The approach that BLEU takes +is to compare the n-grams + +26 +00:00:56,730 --> 00:00:58,550 +of the generated +translation to the n-grams + +27 +00:00:58,550 --> 00:01:00,390 +in the references. + +28 +00:01:00,390 --> 00:01:02,400 +Now, an n-gram is just +a fancy way of saying + +29 +00:01:02,400 --> 00:01:03,960 +a chunk of n words. + +30 +00:01:03,960 --> 00:01:05,220 +So let's start with unigrams, + +31 +00:01:05,220 --> 00:01:08,020 +which corresponds to the +individual words in a sentence. + +32 +00:01:08,880 --> 00:01:11,250 +In this example, you can +see that four of the words + +33 +00:01:11,250 --> 00:01:13,140 +in the generated +translation are also found + +34 +00:01:13,140 --> 00:01:14,990 +in one of the reference translations. + +35 +00:01:16,350 --> 00:01:18,240 +And once we've found our matches, + +36 +00:01:18,240 --> 00:01:20,130 +one way to assign a +score to the translation + +37 +00:01:20,130 --> 00:01:23,070 +is to compute the +precision of the unigrams. + +38 +00:01:23,070 --> 00:01:25,200 +This means we just count +the number of matching words + +39 +00:01:25,200 --> 00:01:27,360 +in the generated and +reference translations + +40 +00:01:27,360 --> 00:01:29,660 +and normalize the count by +dividing by the number of words + +41 +00:01:29,660 --> 00:01:30,753 +in the generation. + +42 +00:01:31,800 --> 00:01:34,080 +In this example, we +found four matching words + +43 +00:01:34,080 --> 00:01:36,033 +and our generation has five words. + +44 +00:01:37,140 --> 00:01:39,690 +Now, in general, precision +ranges from zero to one, + +45 +00:01:39,690 --> 00:01:42,390 +and higher precision scores +mean a better translation. + +46 +00:01:44,160 --> 00:01:45,570 +But this isn't really the whole story + +47 +00:01:45,570 --> 00:01:47,310 +because one problem with unigram precision + +48 +00:01:47,310 --> 00:01:49,140 +is that translation +models sometimes get stuck + +49 +00:01:49,140 --> 00:01:51,330 +in repetitive patterns and +just repeat the same word + +50 +00:01:51,330 --> 00:01:52,293 +several times. + +51 +00:01:53,160 --> 00:01:54,690 +If we just count the +number of word matches, + +52 +00:01:54,690 --> 00:01:56,370 +we can get really high precision scores + +53 +00:01:56,370 --> 00:01:57,840 +even though the translation is terrible + +54 +00:01:57,840 --> 00:01:59,090 +from a human perspective! + +55 +00:02:00,000 --> 00:02:02,970 +For example, if our model +just generates the word six, + +56 +00:02:02,970 --> 00:02:05,020 +we get a perfect unigram precision score. + +57 +00:02:06,960 --> 00:02:09,930 +So to handle this, BLEU +uses a modified precision + +58 +00:02:09,930 --> 00:02:12,210 +that clips the number of +times to count a word, + +59 +00:02:12,210 --> 00:02:13,680 +based on the maximum number of times + +60 +00:02:13,680 --> 00:02:16,399 +it appears in the reference translation. 
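
A tiny worked sketch of the modified (clipped) unigram precision just described; the example strings are stand-ins for the translation shown in the video:

from collections import Counter

def modified_unigram_precision(generation, reference):
    gen_counts = Counter(generation.split())
    ref_counts = Counter(reference.split())
    # Clip each word's count at the number of times it appears in the reference
    clipped_matches = sum(
        min(count, ref_counts[word]) for word, count in gen_counts.items()
    )
    return clipped_matches / sum(gen_counts.values())

# A degenerate generation that just repeats one word from the reference:
# naive precision would be 5/5 = 1.0, but the clipped precision is 1/5 = 0.2
print(modified_unigram_precision("six six six six six", "I am thirty six years old"))
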
+ +61 +00:02:16,399 --> 00:02:18,630 +In this example, the word +six only appears once + +62 +00:02:18,630 --> 00:02:21,360 +in the reference, so we +clip the numerator to one + +63 +00:02:21,360 --> 00:02:22,710 +and the modified unigram precision + +64 +00:02:22,710 --> 00:02:25,233 +now gives a much lower score as expected. + +65 +00:02:27,660 --> 00:02:29,400 +Another problem with unigram precision + +66 +00:02:29,400 --> 00:02:30,780 +is that it doesn't take into account + +67 +00:02:30,780 --> 00:02:33,900 +the order in which the words +appear in the translations. + +68 +00:02:33,900 --> 00:02:35,700 +For example, suppose we had Yoda + +69 +00:02:35,700 --> 00:02:37,410 +translate our Spanish sentence, + +70 +00:02:37,410 --> 00:02:39,457 +then we might get +something backwards like, + +71 +00:02:39,457 --> 00:02:42,450 +"Years sixty thirty have I." + +72 +00:02:42,450 --> 00:02:44,670 +In this case, the +modified unigram precision + +73 +00:02:44,670 --> 00:02:47,393 +gives a high precision which +is not really what we want. + +74 +00:02:48,480 --> 00:02:50,460 +So to deal with word ordering problems, + +75 +00:02:50,460 --> 00:02:52,020 +BLEU actually computes the precision + +76 +00:02:52,020 --> 00:02:55,410 +for several different n-grams +and then averages the result. + +77 +00:02:55,410 --> 00:02:57,300 +For example, if we compare 4-grams, + +78 +00:02:57,300 --> 00:02:58,830 +we can see that there +are no matching chunks + +79 +00:02:58,830 --> 00:03:01,020 +of four words in the translations, + +80 +00:03:01,020 --> 00:03:02,913 +and so the 4-gram precision is 0. + +81 +00:03:05,460 --> 00:03:07,560 +Now, to compute BLEU +scores in Datasets library + +82 +00:03:07,560 --> 00:03:09,120 +is really very simple. + +83 +00:03:09,120 --> 00:03:11,100 +You just use the load_metric function, + +84 +00:03:11,100 --> 00:03:13,290 +provide your model's predictions +with their references + +85 +00:03:13,290 --> 00:03:14,390 +and you're good to go! + +86 +00:03:16,470 --> 00:03:19,200 +The output will contain +several fields of interest. + +87 +00:03:19,200 --> 00:03:20,490 +The precisions field contains + +88 +00:03:20,490 --> 00:03:23,133 +all the individual precision +scores for each n-gram. + +89 +00:03:25,050 --> 00:03:26,940 +The BLEU score itself is then calculated + +90 +00:03:26,940 --> 00:03:30,090 +by taking the geometric mean +of the precision scores. + +91 +00:03:30,090 --> 00:03:32,790 +And by default, the mean of +all four n-gram precisions + +92 +00:03:32,790 --> 00:03:35,793 +is reported, a metric that is +sometimes also called BLEU-4. + +93 +00:03:36,660 --> 00:03:38,880 +In this example, we can +see the BLEU score is zero + +94 +00:03:38,880 --> 00:03:40,780 +because the 4-gram precision was zero. + +95 +00:03:43,290 --> 00:03:45,390 +Now, the BLEU metric has +some nice properties, + +96 +00:03:45,390 --> 00:03:47,520 +but it is far from a perfect metric. + +97 +00:03:47,520 --> 00:03:49,440 +The good properties are +that it's easy to compute + +98 +00:03:49,440 --> 00:03:50,970 +and it's widely used in research + +99 +00:03:50,970 --> 00:03:52,620 +so you can compare your +model against others + +100 +00:03:52,620 --> 00:03:54,630 +on common benchmarks. + +101 +00:03:54,630 --> 00:03:56,670 +On the other hand, there are +several big problems with BLEU, + +102 +00:03:56,670 --> 00:03:58,830 +including the fact it +doesn't incorporate semantics + +103 +00:03:58,830 --> 00:04:01,920 +and it struggles a lot +on non-English languages. 
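
A minimal sketch of computing BLEU and SacreBLEU with datasets.load_metric as described in this video; the example prediction and references are placeholders:

from datasets import load_metric

generation = "I have thirty six years"
references = ["I am thirty-six years old", "I'm 36 years old"]

# BLEU expects pre-tokenized words: one list of tokens per prediction,
# and a list of tokenized reference translations for each prediction
bleu = load_metric("bleu")
results = bleu.compute(
    predictions=[generation.split()],
    references=[[ref.split() for ref in references]],
)
print(results["bleu"], results["precisions"])

# SacreBLEU works on raw strings and handles tokenization under the hood
sacrebleu = load_metric("sacrebleu")
results = sacrebleu.compute(predictions=[generation], references=[references])
print(results["score"])
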
+ +104 +00:04:01,920 --> 00:04:02,790 +Another problem with BLEU + +105 +00:04:02,790 --> 00:04:04,620 +is that it assumes the human translations + +106 +00:04:04,620 --> 00:04:05,820 +have already been tokenized + +107 +00:04:05,820 --> 00:04:07,320 +and this makes it hard to compare models + +108 +00:04:07,320 --> 00:04:08,820 +that use different tokenizers. + +109 +00:04:10,590 --> 00:04:12,570 +So as we've seen, measuring +the quality of texts + +110 +00:04:12,570 --> 00:04:15,570 +is still a difficult and +open problem in NLP research. + +111 +00:04:15,570 --> 00:04:17,580 +For machine translation, +the current recommendation + +112 +00:04:17,580 --> 00:04:19,440 +is to use the SacreBLEU metric, + +113 +00:04:19,440 --> 00:04:22,830 +which addresses the tokenization +limitations of BLEU. + +114 +00:04:22,830 --> 00:04:24,360 +As you can see in this example, + +115 +00:04:24,360 --> 00:04:26,580 +computing the SacreBLEU +score is almost identical + +116 +00:04:26,580 --> 00:04:28,020 +to the BLEU one. + +117 +00:04:28,020 --> 00:04:30,360 +The main difference is that +we now pass a list of texts + +118 +00:04:30,360 --> 00:04:32,640 +instead of a list of +words to the translations, + +119 +00:04:32,640 --> 00:04:35,640 +and SacreBLEU takes care of the +tokenization under the hood. + +120 +00:04:36,582 --> 00:04:39,499 +(screen whooshing) + diff --git a/subtitles/en/61_data-processing-for-summarization.srt b/subtitles/en/61_data-processing-for-summarization.srt index 49492b2c1..4ac57652f 100644 --- a/subtitles/en/61_data-processing-for-summarization.srt +++ b/subtitles/en/61_data-processing-for-summarization.srt @@ -1,104 +1,221 @@ -1 -00:00:05,360 --> 00:00:10,720 -Let's see how to preprocess a dataset  -for summarization. This is the task of   - -2 -00:00:10,720 --> 00:00:16,976 -well summarizing a long document. This video will  -focus on how to preprocess your dataset once you   - -3 -00:00:16,976 --> 00:00:21,840 -have managed to put it in the following format:  -one column for the long documents, and one for   - -4 -00:00:21,840 --> 00:00:27,360 -the summaries. Here is how we can achieve this  -with the Datasets library on the XSUM dataset.   - -5 -00:00:28,400 --> 00:00:32,400 -As long as you manage to have your data look like  -this, you should be able to follow the same steps.   - -6 -00:00:33,520 --> 00:00:37,280 -For once, our labels are not integers  -corresponding to some classes,   - -7 -00:00:37,280 --> 00:00:43,120 -but plain text. We will thus need to tokenize  -them, like our inputs. There is a small trap   - -8 -00:00:43,120 --> 00:00:47,760 -there though, as we need to tokenize our targets  -inside the as_target_tokenzier context manager.   - -9 -00:00:48,480 --> 00:00:53,200 -This is because the special tokens we add might be  -slightly different for the inputs and the targets,   - -10 -00:00:53,760 --> 00:00:58,320 -so the tokenizer has to know which one it  -is processing. Processing the whole dataset   - -11 -00:00:58,320 --> 00:01:03,520 -is then super easy with the map function. Since  -the summaries are usually much shorter than the   - -12 -00:01:03,520 --> 00:01:07,840 -documents, you should definitely pick different  -maximum lengths for the inputs and targets.   - -13 -00:01:08,640 --> 00:01:12,640 -You can choose to pad at this stage to that  -maximum length by setting padding=max_length.   - -14 -00:01:13,840 --> 00:01:17,360 -Here we will show you how to pad  -dynamically as it requires one more step.   
- -15 -00:01:18,640 --> 00:01:23,360 -Your inputs and targets are all sentence of  -various lengths. We will pad the inputs and   - -16 -00:01:23,360 --> 00:01:27,920 -targets separately as the maximum length of the  -inputs and targets are completely different.   - -17 -00:01:28,880 --> 00:01:32,320 -Then we pad the inputs to the  -maximum lengths among the inputs,   - -18 -00:01:32,320 --> 00:01:38,800 -and same for the targets. We pad the inputs with  -the pad token and the targets with the -100 index,   - -19 -00:01:38,800 --> 00:01:44,400 -to make sure they are not taken into account in  -the loss computation. The Transformers library   - -20 -00:01:44,400 --> 00:01:49,200 -provides us with a data collator to do this  -all automatically. You can then pass it to   - -21 -00:01:49,200 --> 00:01:55,440 -the Trainer with your datasets, or use it in the  -to_tf_dataset method before using model.fit(). +1 +00:00:00,227 --> 00:00:01,359 +(air whooshing) + +2 +00:00:01,359 --> 00:00:02,610 +(smiley clicking) + +3 +00:00:02,610 --> 00:00:05,550 +(air whooshing) + +4 +00:00:05,550 --> 00:00:08,450 +- Let's see how to preprocess +a dataset for summarization. + +5 +00:00:09,750 --> 00:00:13,083 +This is the task of, well, +summarizing a long document. + +6 +00:00:14,040 --> 00:00:16,830 +This video will focus on how +to preprocess your dataset + +7 +00:00:16,830 --> 00:00:19,680 +once you have managed to put +it in the following format: + +8 +00:00:19,680 --> 00:00:21,510 +one column for the long documents, + +9 +00:00:21,510 --> 00:00:23,610 +and one for the summaries. + +10 +00:00:23,610 --> 00:00:24,930 +Here is how we can achieve this + +11 +00:00:24,930 --> 00:00:27,573 +with the Datasets library +on the XSUM dataset. + +12 +00:00:28,650 --> 00:00:30,810 +As long as you manage to have +your data look like this, + +13 +00:00:30,810 --> 00:00:33,690 +you should be able to +follow the same steps. + +14 +00:00:33,690 --> 00:00:35,880 +For once, our labels are not integers + +15 +00:00:35,880 --> 00:00:39,150 +corresponding to some +classes, but plain text. + +16 +00:00:39,150 --> 00:00:42,480 +We will thus need to tokenize +them, like our inputs. + +17 +00:00:42,480 --> 00:00:43,920 +There is a small trap there though, + +18 +00:00:43,920 --> 00:00:45,360 +as we need to tokenize our targets + +19 +00:00:45,360 --> 00:00:48,690 +inside the as_target_tokenizer +context manager. + +20 +00:00:48,690 --> 00:00:51,030 +This is because the special tokens we add + +21 +00:00:51,030 --> 00:00:54,000 +might be slightly different +for the inputs and the target, + +22 +00:00:54,000 --> 00:00:57,300 +so the tokenizer has to know +which one it is processing. + +23 +00:00:57,300 --> 00:00:59,550 +Processing the whole +dataset is then super easy + +24 +00:00:59,550 --> 00:01:01,290 +with the map function. + +25 +00:01:01,290 --> 00:01:03,450 +Since the summaries are +usually much shorter + +26 +00:01:03,450 --> 00:01:05,400 +than the documents, you +should definitely pick + +27 +00:01:05,400 --> 00:01:08,880 +different maximum lengths +for the inputs and targets. + +28 +00:01:08,880 --> 00:01:11,730 +You can choose to pad at this +stage to that maximum length + +29 +00:01:11,730 --> 00:01:14,070 +by setting padding=max_length. + +30 +00:01:14,070 --> 00:01:16,170 +Here we'll show you +how to pad dynamically, + +31 +00:01:16,170 --> 00:01:17,620 +as it requires one more step. + +32 +00:01:18,840 --> 00:01:20,910 +Your inputs and targets are all sentences + +33 +00:01:20,910 --> 00:01:22,620 +of various lengths. 
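
A minimal sketch of the preprocessing and dynamic padding described here, assuming the XSUM dataset named in these subtitles and an illustrative T5 checkpoint; the checkpoint name and maximum lengths are assumptions:

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

raw_datasets = load_dataset("xsum")
checkpoint = "t5-small"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Summaries are usually much shorter than the documents
max_input_length, max_target_length = 512, 64

def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["document"], max_length=max_input_length, truncation=True
    )
    # Tell the tokenizer it is now processing the targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# Pads the inputs with the pad token and the labels with -100, batch by batch
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
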
+ +34 +00:01:22,620 --> 00:01:24,960 +We'll pad the inputs +and targets separately + +35 +00:01:24,960 --> 00:01:27,030 +as the maximum lengths +of the inputs and targets + +36 +00:01:27,030 --> 00:01:28,280 +are completely different. + +37 +00:01:29,130 --> 00:01:31,170 +Then, we pad the inputs +to the maximum lengths + +38 +00:01:31,170 --> 00:01:33,813 +among the inputs, and same for the target. + +39 +00:01:34,860 --> 00:01:36,630 +We pad the input with the pad token, + +40 +00:01:36,630 --> 00:01:39,000 +and the targets with the -100 index + +41 +00:01:39,000 --> 00:01:40,980 +to make sure they are +not taken into account + +42 +00:01:40,980 --> 00:01:42,180 +in the loss computation. + +43 +00:01:43,440 --> 00:01:45,180 +The Transformers library provide us + +44 +00:01:45,180 --> 00:01:48,510 +with a data collator to +do this all automatically. + +45 +00:01:48,510 --> 00:01:51,690 +You can then pass it to the +Trainer with your datasets, + +46 +00:01:51,690 --> 00:01:55,710 +or use it in the to_tf_dataset +method before using model.fit + +47 +00:01:55,710 --> 00:01:56,823 +on your current model. + +48 +00:01:58,339 --> 00:02:01,520 +(air whooshing) + +49 +00:02:01,520 --> 00:02:02,876 +(air whooshing) + diff --git a/subtitles/en/62_what-is-the-rouge-metric.srt b/subtitles/en/62_what-is-the-rouge-metric.srt index 4caff1f32..5450615b0 100644 --- a/subtitles/en/62_what-is-the-rouge-metric.srt +++ b/subtitles/en/62_what-is-the-rouge-metric.srt @@ -1,234 +1,455 @@ -1 -00:00:05,520 --> 00:00:12,080 -What is the ROUGE metric? For many NLP tasks we  -can use common metrics like accuracy or F1 score,   - -2 -00:00:12,080 --> 00:00:15,920 -but what do you do when you want to measure  -the quality of a summary from a model like T5?   - -3 -00:00:16,720 --> 00:00:20,265 -In this video, we'll take a look at a widely  -used metric for text summarization called ROUGE,   - -4 -00:00:20,265 --> 00:00:23,360 -which is short for Recall-Oriented Understudy for  -Gisting Evaluation. There are actually several   - -5 -00:00:23,360 --> 00:00:27,280 -variants of ROUGE, but the basic idea behind  -all of them is to assign a single numerical   - -6 -00:00:27,280 --> 00:00:31,360 -score to a summary that tells us how "good" it  -is compared to one or more reference summaries.   - -7 -00:00:32,320 --> 00:00:35,360 -In this example we have a book review  -that has been summarized by some model.   - -8 -00:00:36,400 --> 00:00:39,600 -If we compare the generated summary  -to some reference human summaries,   - -9 -00:00:39,600 --> 00:00:43,840 -we can see that the model is pretty  -good, and only differs by a word or two.   - -10 -00:00:44,800 --> 00:00:48,000 -So how can we measure the quality of a  -generated summary in an automatic way?   - -11 -00:00:48,800 --> 00:00:52,880 -The approach that ROUGE takes is to compare  -the n-grams of the generated summary to the   - -12 -00:00:52,880 --> 00:00:58,400 -n-grams of the references. An n-gram is just  -a fancy way of saying "a chunk of n words",   - -13 -00:00:58,400 --> 00:01:02,080 -so let's start with unigrams, which correspond  -to the individual words in a sentence.   - -14 -00:01:03,600 --> 00:01:07,760 -In this example you can see that six of the words  -in the generated summary are also found in one of   - -15 -00:01:07,760 --> 00:01:11,840 -the reference summaries. The ROUGE metric  -that compares unigrams is called ROUGE-1.   
- -16 -00:01:14,000 --> 00:01:18,000 -Now that we've found our matches, one way to  -assign a score to the summary is to compute the   - -17 -00:01:18,000 --> 00:01:22,880 -recall of the unigrams. This means we just count  -the number of matching words in the generated and   - -18 -00:01:22,880 --> 00:01:27,040 -reference summaries and normalize the count by  -dividing by the number of word in the reference.   - -19 -00:01:28,000 --> 00:01:31,920 -In this example, we found 6 matching  -words and our reference has 6 words,   - -20 -00:01:31,920 --> 00:01:36,240 -so our unigram recall is perfect! This  -means that all of words in the reference   - -21 -00:01:36,240 --> 00:01:42,320 -summary have produced in the generated one.  -Perfect recall sounds great, but imagine if   - -22 -00:01:42,320 --> 00:01:47,120 -our generated summary had been “I really really  -really really loved reading the Hunger Games”.   - -23 -00:01:47,920 --> 00:01:52,240 -This would also have perfect recall, but is  -arguably a worse summary since it is verbose.   - -24 -00:01:53,280 --> 00:01:57,840 -To deal with these scenarios we can also compute  -precision, which in the ROUGE context measures   - -25 -00:01:57,840 --> 00:02:01,200 -how much of the generated summary was relevant. In  -this example, the precision is 6/7. In practice,   - -26 -00:02:01,200 --> 00:02:05,200 -both precision and recall are usually  -computed and then the F1-score is reported.   - -27 -00:02:07,360 --> 00:02:12,000 -We can change the granularity of the comparison  -by comparing bigrams instead of unigrams.   - -28 -00:02:12,800 --> 00:02:17,760 -With bigrams we chunk the sentence into pairs of  -consecutive words and then count how many pairs in   - -29 -00:02:17,760 --> 00:02:23,600 -the generated summary are present in the reference  -one. This gives us ROUGE-2 precision and recall,   - -30 -00:02:23,600 --> 00:02:28,800 -which we can see is lower than the ROUGE-1 scores  -we saw earlier. Note that if the summaries are   - -31 -00:02:28,800 --> 00:02:34,560 -long, the ROUGE-2 score will be small as there  -are typically fewer bigrams to match. This is   - -32 -00:02:34,560 --> 00:02:39,680 -also true for abstractive summarization, so both  -ROUGE-1 and ROUGE-2 scores are usually reported.   - -33 -00:02:41,760 --> 00:02:46,880 -The last ROUGE variant we'll discuss is  -ROUGE-L. ROUGE-L doesn't compare n-grams,   - -34 -00:02:46,880 --> 00:02:51,360 -but instead treats each summary as a sequence  -of words and then looks for the longest common   - -35 -00:02:51,360 --> 00:02:57,280 -subsequence or LCS. A subsequence is a sequence  -that appears in the same relative order,   - -36 -00:02:57,280 --> 00:03:03,280 -but not necessarily contiguous. So in this  -example, "I loved reading the Hunger Games" is the   - -37 -00:03:03,280 --> 00:03:11,120 -longest common subsequence. The main advantage of  -ROUGE-L over ROUGE-1 or ROUGE-2 is that is doesn't   - -38 -00:03:11,120 --> 00:03:18,400 -depend on consecutive n-gram matches, so it tends  -to capture sentence structure more accurately. To   - -39 -00:03:18,400 --> 00:03:23,200 -compute ROUGE scores in Hugging Face Datasets is  -very simple: just use the load_metric() function,   - -40 -00:03:23,760 --> 00:03:26,960 -provide your model's summaries along with  -the references and you're good to go!   - -41 -00:03:28,560 --> 00:03:32,480 -The output from the calculation contains  -a lot of information! 
The first thing we   - -42 -00:03:32,480 --> 00:03:36,880 -can see here is that the confidence intervals  -of each ROUGE score are provided in the low,   - -43 -00:03:36,880 --> 00:03:41,680 -mid, and high fields. This is really useful if you  -want to know the spread of your ROUGE scores when   - -44 -00:03:41,680 --> 00:03:48,080 -comparing two or more models. The second thing to  -notice is that we have four types of ROUGE score.   - -45 -00:03:48,080 --> 00:03:53,840 -We've already seen ROUGE-1, ROUGE-2 and  -ROUGE-L, so what is ROUGE-LSUM? Well,   - -46 -00:03:53,840 --> 00:03:58,800 -the “sum” in ROUGE-LSUM refers to the fact that  -this metric is computed over a whole summary,   - -47 -00:03:58,800 --> 00:04:08,480 -while ROUGE-L is computed as the  -average over individual sentences. +1 +00:00:00,624 --> 00:00:03,374 +(logo whooshing) + +2 +00:00:05,700 --> 00:00:07,740 +- What is the ROUGE metric? + +3 +00:00:07,740 --> 00:00:08,880 +For many NLP tasks + +4 +00:00:08,880 --> 00:00:12,270 +we can use common metrics +like accuracy or F1 score. + +5 +00:00:12,270 --> 00:00:13,650 +But what do you do when +you wanna measure something + +6 +00:00:13,650 --> 00:00:16,920 +like the quality of a +summary from a model like T5? + +7 +00:00:16,920 --> 00:00:18,180 +In this video, we'll take a look + +8 +00:00:18,180 --> 00:00:21,180 +at a widely used metric for +tech summarization called ROUGE. + +9 +00:00:22,740 --> 00:00:24,660 +There are actually +several variants of ROUGE + +10 +00:00:24,660 --> 00:00:26,190 +but the basic idea behind all of them + +11 +00:00:26,190 --> 00:00:27,840 +is to assign a single numerical score + +12 +00:00:27,840 --> 00:00:30,000 +to a summary that tells us how good it is + +13 +00:00:30,000 --> 00:00:32,774 +compared to one or more +reference summaries. + +14 +00:00:32,774 --> 00:00:34,020 +In this example, we have a book review + +15 +00:00:34,020 --> 00:00:36,570 +that has been summarized by some model. + +16 +00:00:36,570 --> 00:00:38,320 +If we compare the generated summary + +17 +00:00:39,168 --> 00:00:40,260 +to some reference human +summaries, we can see + +18 +00:00:40,260 --> 00:00:42,841 +that the model is actually pretty good + +19 +00:00:42,841 --> 00:00:44,063 +and only differs by a word or two. + +20 +00:00:45,060 --> 00:00:46,260 +So how can we measure the quality + +21 +00:00:46,260 --> 00:00:49,050 +of a generated summary +in an automatic way? + +22 +00:00:49,050 --> 00:00:51,510 +The approach that ROUGE takes +is to compare the n-grams + +23 +00:00:51,510 --> 00:00:55,200 +of the generated summary to +the n-grams of the references. + +24 +00:00:55,200 --> 00:00:58,590 +And n-gram is just a fancy way +of saying a chunk of N words. + +25 +00:00:58,590 --> 00:01:00,030 +So let's start with unigrams + +26 +00:01:00,030 --> 00:01:02,780 +which correspond to the +individual words in a sentence. + +27 +00:01:03,780 --> 00:01:05,250 +In this example, you can see that six + +28 +00:01:05,250 --> 00:01:07,650 +of the words in the generated +summary are also found + +29 +00:01:07,650 --> 00:01:09,420 +in one of the reference summaries. + +30 +00:01:09,420 --> 00:01:11,310 +And the rouge metric +that compares unigrams + +31 +00:01:11,310 --> 00:01:12,260 +is called ROUGE-1. + +32 +00:01:14,533 --> 00:01:16,770 +Now that we found our matches, +one way to assign a score + +33 +00:01:16,770 --> 00:01:20,280 +to the summary is to compute +the recall of the unigrams. 
+ +34 +00:01:20,280 --> 00:01:21,540 +This means we just count the number + +35 +00:01:21,540 --> 00:01:22,950 +of matching words in the generated + +36 +00:01:22,950 --> 00:01:25,290 +and reference summaries +and normalize the count + +37 +00:01:25,290 --> 00:01:28,200 +by dividing by the number +of words in the reference. + +38 +00:01:28,200 --> 00:01:30,450 +In this example, we +found six matching words + +39 +00:01:30,450 --> 00:01:32,160 +and our reference has six words. + +40 +00:01:32,160 --> 00:01:33,933 +So our unigram recall is perfect. + +41 +00:01:34,800 --> 00:01:35,810 +This means that all of the words + +42 +00:01:35,810 --> 00:01:37,500 +in the reference summary +have been produced + +43 +00:01:37,500 --> 00:01:38,550 +in the generated one. + +44 +00:01:40,050 --> 00:01:42,360 +Now, perfect recall +sounds great, but imagine + +45 +00:01:42,360 --> 00:01:44,520 +if our generated summary +have been something like + +46 +00:01:44,520 --> 00:01:45,720 +I really, really, really, + +47 +00:01:45,720 --> 00:01:48,150 +really loved reading the Hunger Games. + +48 +00:01:48,150 --> 00:01:49,378 +This would also have perfect recall + +49 +00:01:49,378 --> 00:01:51,330 +but is arguably a worse summary, + +50 +00:01:51,330 --> 00:01:52,653 +since it is verbose. + +51 +00:01:53,550 --> 00:01:54,600 +To deal with these scenarios, + +52 +00:01:54,600 --> 00:01:56,190 +we can also compute precision, + +53 +00:01:56,190 --> 00:01:58,380 +which in the ROUGE +context measures how much + +54 +00:01:58,380 --> 00:02:00,810 +of the generator summary was relevant. + +55 +00:02:00,810 --> 00:02:03,630 +In practice, both precision +and recall are usually computed + +56 +00:02:03,630 --> 00:02:05,493 +and then the F1 score is reported. + +57 +00:02:07,170 --> 00:02:08,542 +Now we can change the granularity + +58 +00:02:08,542 --> 00:02:13,020 +of the comparison by comparing +bigrams instead of unigrams. + +59 +00:02:13,020 --> 00:02:15,090 +With bigrams, we chunk +the sentence into pairs + +60 +00:02:15,090 --> 00:02:17,910 +of consecutive words and +then count how many pairs + +61 +00:02:17,910 --> 00:02:21,360 +in the generated summary are +present in the reference one. + +62 +00:02:21,360 --> 00:02:23,880 +This gives us ROUGE-2 precision and recall + +63 +00:02:23,880 --> 00:02:24,780 +which as we can see, + +64 +00:02:24,780 --> 00:02:27,780 +is lower than the ROUGE-1 +scores from earlier. + +65 +00:02:27,780 --> 00:02:29,400 +Now, if the summaries are long, + +66 +00:02:29,400 --> 00:02:31,740 +the ROUGE-2 scores will generally be small + +67 +00:02:31,740 --> 00:02:34,290 +because there are fewer bios to match. + +68 +00:02:34,290 --> 00:02:36,870 +And this is also true for +abstracter summarization. + +69 +00:02:36,870 --> 00:02:39,993 +So both ROUGE-1 and ROUGE-2 +scores are usually reported. + +70 +00:02:42,000 --> 00:02:45,330 +The last ROUGE variant we +will discuss is ROUGE L. + +71 +00:02:45,330 --> 00:02:47,160 +ROUGE L doesn't compare ngrams + +72 +00:02:47,160 --> 00:02:49,572 +but instead treats each +summary as a sequence of words + +73 +00:02:49,572 --> 00:02:53,403 +and then looks for the longest +common subsequence or LCS. + +74 +00:02:54,775 --> 00:02:56,130 +A subsequence is a sequence that appears + +75 +00:02:56,130 --> 00:02:59,760 +in the same relative order, +but not necessarily contiguous. 
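
A small worked sketch of ROUGE-1 recall and precision as just described; the summaries below are illustrative stand-ins for the book-review example in the video:

from collections import Counter

def rouge1_recall_precision(generated, reference):
    gen_counts = Counter(generated.split())
    ref_counts = Counter(reference.split())
    # Overlapping unigrams, clipped by how often each word appears in both
    overlap = sum(min(count, gen_counts[word]) for word, count in ref_counts.items())
    recall = overlap / sum(ref_counts.values())
    precision = overlap / sum(gen_counts.values())
    return recall, precision

reference = "I loved reading the Hunger Games"

# Identical summary: perfect recall and perfect precision, (1.0, 1.0)
print(rouge1_recall_precision("I loved reading the Hunger Games", reference))

# Verbose summary: still perfect recall, but precision drops to 6/10
print(rouge1_recall_precision(
    "I really really really really loved reading the Hunger Games", reference
))
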
+ +76 +00:02:59,760 --> 00:03:03,210 +So in this example, I loved +reading the Hunger Games, + +77 +00:03:03,210 --> 00:03:06,930 +is the longest common subsequence +between the two summaries. + +78 +00:03:06,930 --> 00:03:08,610 +And the main advantage of ROUGE L + +79 +00:03:08,610 --> 00:03:11,670 +over ROUGE-1 or ROUGE-2 +is that it doesn't depend + +80 +00:03:11,670 --> 00:03:14,100 +on consecutive n-gram +matches, and so it tends + +81 +00:03:14,100 --> 00:03:16,650 +to capture sentence structure +much more accurately. + +82 +00:03:18,150 --> 00:03:19,440 +Now to compute ROUGE scores + +83 +00:03:19,440 --> 00:03:21,660 +in the data sets library is very simple. + +84 +00:03:21,660 --> 00:03:23,910 +You just use the load metric function, + +85 +00:03:23,910 --> 00:03:26,400 +provide your model summaries +along with the references + +86 +00:03:26,400 --> 00:03:27,500 +and you're good to go. + +87 +00:03:28,770 --> 00:03:30,120 +The output from the calculation + +88 +00:03:30,120 --> 00:03:31,507 +contains a lot of information. + +89 +00:03:31,507 --> 00:03:34,560 +The first thing we can see is +that the confidence intervals + +90 +00:03:34,560 --> 00:03:36,090 +of each ROUGE score are provided + +91 +00:03:36,090 --> 00:03:39,030 +in the low, mid and high fields. + +92 +00:03:39,030 --> 00:03:40,980 +This is really useful if +you wanna know the spread + +93 +00:03:40,980 --> 00:03:43,730 +of your ROUGE scores when +comparing two or more models. + +94 +00:03:45,090 --> 00:03:46,050 +The second thing to notice + +95 +00:03:46,050 --> 00:03:48,330 +is that we have four types of ROUGE score. + +96 +00:03:48,330 --> 00:03:51,480 +We've already seen ROUGE-1, +ROUGE-2 and ROUGE-L + +97 +00:03:51,480 --> 00:03:53,760 +So what is ROUGE-L sum? + +98 +00:03:53,760 --> 00:03:55,410 +Well, the sum in ROUGEL's sum + +99 +00:03:55,410 --> 00:03:57,630 +refers to the fact that +this metric is computed + +100 +00:03:57,630 --> 00:04:00,240 +over a whole summary +while ROUGE-L is computed + +101 +00:04:00,240 --> 00:04:02,493 +as the average of individual sentences. + +102 +00:04:04,166 --> 00:04:06,916 +(logo whooshing) + diff --git a/subtitles/en/63_data-processing-for-causal-language-modeling.srt b/subtitles/en/63_data-processing-for-causal-language-modeling.srt index 967ddc207..d5d544dee 100644 --- a/subtitles/en/63_data-processing-for-causal-language-modeling.srt +++ b/subtitles/en/63_data-processing-for-causal-language-modeling.srt @@ -1,214 +1,415 @@ -1 -00:00:05,520 --> 00:00:09,360 -In this video we take a look at the  -data processing necessary to train   - -2 -00:00:09,360 --> 00:00:15,920 -causal language models. Causal Language Modeling  -is the task of predicting the next token based   - -3 -00:00:15,920 --> 00:00:20,880 -on the previous token. Another term for Causal  -Language Modeling is Autoregressive Modeling.   - -4 -00:00:21,760 --> 00:00:26,560 -In the example that you see here the  -next token could for example be NLP   - -5 -00:00:26,560 --> 00:00:33,280 -or machine learning. A popular example of a  -Causal Language Model is the GPT family of models.   - -6 -00:00:35,680 --> 00:00:40,400 -To train such models such as GPT-2 we usually  -start with a large corpus of text files.   - -7 -00:00:41,280 --> 00:00:45,760 -These files can webpages scraped from the  -internet such as the Common Crawl dataset   - -8 -00:00:45,760 --> 00:00:51,920 -or they can be Python files from GitHub like you  -can see here. 
As a first step we need to tokenize   - -9 -00:00:51,920 --> 00:00:57,520 -these files such that we can feed them through a  -model. Here we show the tokenized texts as bars of   - -10 -00:00:57,520 --> 00:01:06,000 -various length illustrating the different sequence  -lengths. Normally, the text files come in various   - -11 -00:01:06,000 --> 00:01:07,440 -sizes and which results in various sequence length  -of the tokenized texts. Transformer models have a   - -12 -00:01:07,440 --> 00:01:12,960 -limited context length and depending on the data  -source it is possible that the tokenized texts   - -13 -00:01:12,960 --> 00:01:18,640 -are much longer than this context length. In  -this case we could just truncate the sequence   - -14 -00:01:18,640 --> 00:01:24,160 -to the context length but this would mean that  -we loose everything after the context length.   - -15 -00:01:25,360 --> 00:01:30,960 -Using the return overflowing tokens flag in the  -we can use the tokenizer to create chunks with   - -16 -00:01:30,960 --> 00:01:36,960 -each one being the size of the context length.  -Sometimes it can happen that the last chunk is   - -17 -00:01:36,960 --> 00:01:41,440 -too short if there aren’t enough tokens to fill  -it. In this case we would like to remove it.   - -18 -00:01:43,440 --> 00:01:48,800 -With the return_length keyword we also get  -the length of each chunk from the tokenizer.   - -19 -00:01:51,760 --> 00:01:57,280 -This function shows all the steps necessary  -to prepare the dataset. First we tokenize the   - -20 -00:01:57,280 --> 00:02:03,520 -dataset with the flags I just mentioned. Then we  -go through each chunk and if its length matches   - -21 -00:02:03,520 --> 00:02:08,960 -the context length we add it to the inputs we  -return. We can apply this function to the whole   - -22 -00:02:08,960 --> 00:02:17,520 -dataset and we make sure to use batches and remove  -the existing columns. We need to remove columns   - -23 -00:02:17,520 --> 00:02:23,280 -because we can create multiple samples per text  -and the shapes in the dataset would not match.   - -24 -00:02:26,960 --> 00:02:32,400 -If the context length is of similar length as  -the files this approach doesn't so well anymore.   - -25 -00:02:33,520 --> 00:02:39,440 -In this example both sample 1 and 2 are shorter  -than the context size and would be discarded with   - -26 -00:02:39,440 --> 00:02:46,400 -the previous approach. In this case it is better  -to first tokenize each sample without truncation   - -27 -00:02:46,400 --> 00:02:52,000 -and then concatenate the tokenized samples with an  -end of string, or EOS for short, token in between.   - -28 -00:02:53,840 --> 00:02:57,440 -Finally we can chunk this long  -sequence with the context length   - -29 -00:02:57,440 --> 00:03:05,840 -and we don’t loose any sequences because they  -are too short. So far we have only talked about   - -30 -00:03:05,840 --> 00:03:10,720 -the inputs for causal language modeling but  -not the labels needed for supervised training.   - -31 -00:03:11,600 --> 00:03:16,480 -When we do causal language modeling we don’t  -require any extra labels for the input sequences   - -32 -00:03:16,480 --> 00:03:22,080 -as the input sequences themselves are  -the labels. In this example when we feed   - -33 -00:03:22,080 --> 00:03:26,560 -the token “Trans” to the next token we  -want the model to predict is “formers”.   - -34 -00:03:27,280 --> 00:03:33,360 -In the next step we feed “Trans” and “formers”  -to the model and the label is the token “are”.   
- -35 -00:03:35,280 --> 00:03:42,400 -This pattern continues and as you can see the  -input sequence is the label just shifted by one.   - -36 -00:03:43,440 --> 00:03:48,000 -Since the model only makes a prediction  -after the first token, the first element   - -37 -00:03:48,000 --> 00:03:54,480 -of the input sequence, in this case “Trans”,  -is not used as a label. Similarly, we do not   - -38 -00:03:54,480 --> 00:04:00,400 -have a label for the last token in the sequence  -since there is no token after the sequence ends.   - -39 -00:04:03,920 --> 00:04:09,200 -Let’s have a look at what we need to do to create  -the labels for causal language modeling in code.If   - -40 -00:04:10,160 --> 00:04:15,600 -we want to calculate the loss on a batch we can  -just pass the input_ids as labels and all the   - -41 -00:04:15,600 --> 00:04:19,432 -shifting is handled in the model internally. And  -the dataset is also ready to be used directly in   - -42 -00:04:19,432 --> 00:04:21,600 -the Trainer or keras.fit if you are using  -TensorFlow. So you see there is no magic   - -43 -00:04:21,600 --> 00:04:27,840 -involved in processing data for causal language  -modeling and only requires a few simple steps! +1 +00:00:00,000 --> 00:00:02,917 +(transition music) + +2 +00:00:05,364 --> 00:00:08,310 +- In this video, we take a +look at the data processing + +3 +00:00:08,310 --> 00:00:10,803 +necessary to train causal language models. + +4 +00:00:12,690 --> 00:00:14,400 +Causal language modeling is the task + +5 +00:00:14,400 --> 00:00:17,820 +of predicting the next token +based on the previous ones. + +6 +00:00:17,820 --> 00:00:19,680 +Another term for causal language modeling + +7 +00:00:19,680 --> 00:00:21,000 +is autoregressive modeling. + +8 +00:00:21,000 --> 00:00:23,940 +In the example that you can see here, + +9 +00:00:23,940 --> 00:00:25,560 +the next token could, for example, + +10 +00:00:25,560 --> 00:00:28,263 +be NLP or it could be machine learning. + +11 +00:00:29,460 --> 00:00:31,457 +A popular example of +causal language models + +12 +00:00:31,457 --> 00:00:33,693 +is the GPT family of models. + +13 +00:00:35,561 --> 00:00:38,010 +To train models such as GPT, + +14 +00:00:38,010 --> 00:00:41,460 +we usually start with a +large corpus of text files. + +15 +00:00:41,460 --> 00:00:43,890 +These files can be webpages +scraped from the internet + +16 +00:00:43,890 --> 00:00:46,020 +such as the Common Crawl dataset + +17 +00:00:46,020 --> 00:00:47,940 +or they can be Python files from GitHub, + +18 +00:00:47,940 --> 00:00:49,490 +like the ones you can see here. + +19 +00:00:50,400 --> 00:00:52,680 +As a first step, we need +to tokenize these files + +20 +00:00:52,680 --> 00:00:55,380 +such that we can feed +them through the model. + +21 +00:00:55,380 --> 00:00:58,500 +Here, we show the tokenized +texts as bars of various length, + +22 +00:00:58,500 --> 00:01:02,188 +illustrating that they're +shorter and longer ones. + +23 +00:01:02,188 --> 00:01:05,910 +This is very common +when working with text. + +24 +00:01:05,910 --> 00:01:09,270 +However, transform models +have a limited context window + +25 +00:01:09,270 --> 00:01:10,770 +and depending on the data source, + +26 +00:01:10,770 --> 00:01:13,140 +it is possible that the tokenized texts + +27 +00:01:13,140 --> 00:01:15,183 +are much longer than this window. 
+ +28 +00:01:16,080 --> 00:01:18,870 +In this case, we could +just truncate the sequences + +29 +00:01:18,870 --> 00:01:20,182 +to the context length, + +30 +00:01:20,182 --> 00:01:22,650 +but this would mean +that we lose everything + +31 +00:01:22,650 --> 00:01:24,513 +after the first context window. + +32 +00:01:25,500 --> 00:01:28,410 +Using the return overflowing token flag, + +33 +00:01:28,410 --> 00:01:30,960 +we can use the tokenizer to create chunks + +34 +00:01:30,960 --> 00:01:33,510 +with each one being the +size of the context length. + +35 +00:01:34,860 --> 00:01:36,180 +Sometimes, it can still happen + +36 +00:01:36,180 --> 00:01:37,590 +that the last chunk is too short + +37 +00:01:37,590 --> 00:01:39,900 +if there aren't enough tokens to fill it. + +38 +00:01:39,900 --> 00:01:41,793 +In this case, we can just remove it. + +39 +00:01:42,990 --> 00:01:45,960 +With the return_length keyword, + +40 +00:01:45,960 --> 00:01:49,173 +we also get the length of +each chunk from the tokenizer. + +41 +00:01:51,960 --> 00:01:53,640 +This function shows all the steps + +42 +00:01:53,640 --> 00:01:56,280 +necessary to prepare the dataset. + +43 +00:01:56,280 --> 00:01:57,960 +First, we tokenize the dataset + +44 +00:01:57,960 --> 00:02:00,330 +with the flags I just mentioned. + +45 +00:02:00,330 --> 00:02:02,190 +Then, we go through each chunk + +46 +00:02:02,190 --> 00:02:04,680 +and if it's length matches +the context length, + +47 +00:02:04,680 --> 00:02:06,663 +we add it to the inputs we return. + +48 +00:02:07,590 --> 00:02:10,260 +We can apply this function +to the whole dataset. + +49 +00:02:10,260 --> 00:02:11,700 +In addition, we make sure + +50 +00:02:11,700 --> 00:02:15,450 +that to use batches and +remove the existing columns. + +51 +00:02:15,450 --> 00:02:17,670 +We need to remove the existing columns, + +52 +00:02:17,670 --> 00:02:21,330 +because we can create +multiple samples per text, + +53 +00:02:21,330 --> 00:02:22,890 +and the shapes in the dataset + +54 +00:02:22,890 --> 00:02:24,753 +would not match anymore in that case. + +55 +00:02:26,832 --> 00:02:30,330 +If the context length is of +similar lengths as the files, + +56 +00:02:30,330 --> 00:02:32,733 +this approach doesn't +work so well anymore. + +57 +00:02:33,660 --> 00:02:36,420 +In this example, both sample 1 and 2 + +58 +00:02:36,420 --> 00:02:38,400 +are shorter than the context size + +59 +00:02:38,400 --> 00:02:41,610 +and will be discarded with +the previous approach. + +60 +00:02:41,610 --> 00:02:45,150 +In this case, it is better +to first tokenize each sample + +61 +00:02:45,150 --> 00:02:46,590 +without truncation + +62 +00:02:46,590 --> 00:02:49,290 +and then concatenate the tokenized samples + +63 +00:02:49,290 --> 00:02:52,353 +with an end of string +or EOS token in between. + +64 +00:02:53,546 --> 00:02:56,220 +Finally, we can chunk this long sequence + +65 +00:02:56,220 --> 00:02:59,490 +with the context length and we +don't lose too many sequences + +66 +00:02:59,490 --> 00:03:01,263 +because they're too short anymore. + +67 +00:03:04,170 --> 00:03:05,760 +So far, we have only talked + +68 +00:03:05,760 --> 00:03:08,370 +about the inputs for +causal language modeling, + +69 +00:03:08,370 --> 00:03:11,850 +but not the labels needed +for supervised training. 
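
A minimal sketch of the tokenize-and-chunk step described above; the toy in-memory dataset, checkpoint, and context length are placeholders for a real corpus of text or code files:

from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
context_length = 128

# Toy stand-in for a large corpus with a text column named "content"
raw_datasets = Dataset.from_dict(
    {"content": ["import numpy as np\n" * 200, "import pandas as pd\n" * 50]}
)

def tokenize(examples):
    outputs = tokenizer(
        examples["content"],
        truncation=True,
        max_length=context_length,
        return_overflowing_tokens=True,  # keep every chunk, not just the first one
        return_length=True,              # also report the length of each chunk
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == context_length:     # drop the last chunk if it is too short
            input_batch.append(input_ids)
    return {"input_ids": input_batch}

tokenized_datasets = raw_datasets.map(
    tokenize, batched=True, remove_columns=raw_datasets.column_names
)
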
+ +70 +00:03:11,850 --> 00:03:13,380 +When we do causal language modeling, + +71 +00:03:13,380 --> 00:03:16,710 +we don't require any extra +labels for the input sequences + +72 +00:03:16,710 --> 00:03:20,610 +as the input sequences +themselves are the labels. + +73 +00:03:20,610 --> 00:03:24,240 +In this example, when we feed +the token trans to the model, + +74 +00:03:24,240 --> 00:03:27,510 +the next token we wanted +to predict is formers. + +75 +00:03:27,510 --> 00:03:30,780 +In the next step, we feed +trans and formers to the model + +76 +00:03:30,780 --> 00:03:33,903 +and the label we wanted to predict is are. + +77 +00:03:35,460 --> 00:03:38,130 +This pattern continues, +and as you can see, + +78 +00:03:38,130 --> 00:03:41,220 +the input sequence is the label sequence + +79 +00:03:41,220 --> 00:03:42,663 +just shifted by one. + +80 +00:03:43,590 --> 00:03:47,310 +Since the model only makes +prediction after the first token, + +81 +00:03:47,310 --> 00:03:49,350 +the first element of the input sequence, + +82 +00:03:49,350 --> 00:03:52,980 +in this case, trans, +is not used as a label. + +83 +00:03:52,980 --> 00:03:55,530 +Similarly, we don't have a label + +84 +00:03:55,530 --> 00:03:57,600 +for the last token in the sequence + +85 +00:03:57,600 --> 00:04:00,843 +since there is no token +after the sequence ends. + +86 +00:04:04,110 --> 00:04:06,300 +Let's have a look at what we need to do + +87 +00:04:06,300 --> 00:04:10,200 +to create the labels for causal +language modeling in code. + +88 +00:04:10,200 --> 00:04:12,360 +If we want to calculate a loss on a batch, + +89 +00:04:12,360 --> 00:04:15,120 +we can just pass the input_ids as labels + +90 +00:04:15,120 --> 00:04:18,933 +and all the shifting is handled +in the model internally. + +91 +00:04:20,032 --> 00:04:22,170 +So, you see, there's no matching involved + +92 +00:04:22,170 --> 00:04:24,870 +in processing data for +causal language modeling, + +93 +00:04:24,870 --> 00:04:27,723 +and it only requires a few simple steps. + +94 +00:04:28,854 --> 00:04:31,771 +(transition music) + diff --git a/subtitles/en/64_using-a-custom-loss-function.srt b/subtitles/en/64_using-a-custom-loss-function.srt index 71fa3c485..bd75982b9 100644 --- a/subtitles/en/64_using-a-custom-loss-function.srt +++ b/subtitles/en/64_using-a-custom-loss-function.srt @@ -1,169 +1,325 @@ -1 -00:00:05,440 --> 00:00:09,040 -In this video we take a look at setting  -up a custom loss function for training.   - -2 -00:00:10,800 --> 00:00:14,800 -In the default loss functions all  -samples such as these code snippets   - -3 -00:00:14,800 --> 00:00:19,040 -are treated the same irrespective of their  -content, but there are scenarios where you it   - -4 -00:00:19,040 --> 00:00:22,880 -could make sense to weight the samples  -differently. If for example one sample   - -5 -00:00:22,880 --> 00:00:28,800 -contains a lot of tokens that or of interest to  -us or it has a favourable diversity of tokens.   - -6 -00:00:29,680 --> 00:00:33,520 -We can also think of other heuristics we can  -implement with pattern matching or other rules.   - -7 -00:00:36,080 --> 00:00:40,400 -For each sample we get a loss value during  -training and we can combine that loss with   - -8 -00:00:40,400 --> 00:00:47,200 -a weight. Then we can for example create a  -weighted sum to get the final loss for a batch.   
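
A minimal sketch of passing the input_ids as labels so the shifting is handled inside the model; the checkpoint and example sentence are illustrative:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

batch = tokenizer(["Transformers are awesome!"], return_tensors="pt")

# The labels are simply the input_ids; the one-position shift is done internally
outputs = model(input_ids=batch["input_ids"], labels=batch["input_ids"])
print(outputs.loss)
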
- -9 -00:00:48,480 --> 00:00:53,280 -Let’s have a look at a specific example: we  -want to setup a language model that helps us   - -10 -00:00:53,280 --> 00:01:00,800 -autocomplete complete common data science code.  -For that task we would like to weight samples   - -11 -00:01:00,800 --> 00:01:06,960 -stronger where tokens related to the data science  -stack, such as pd or np, occur more frequently.   - -12 -00:01:10,000 --> 00:01:14,788 -Here you see a loss function that does exactly  -that for causal language modeling. It takes the   - -13 -00:01:14,788 --> 00:01:22,800 -models it takes the model’s inputs and predicted  -logits as well as the key tokens as input. First   - -14 -00:01:22,800 --> 00:01:30,320 -the inputs and logits are aligned, then the loss  -per sample is calculate followed by the weights.   - -15 -00:01:32,320 --> 00:01:35,280 -Finally the loss and weights  -are combined and returned.   - -16 -00:01:36,320 --> 00:01:40,480 -This is a pretty big function so let’s take  -a closer look at the loss and weight blocks.   - -17 -00:01:43,200 --> 00:01:47,920 -During the calculation of the standard loss the  -logits and labels are flattened over the batch.   - -18 -00:01:48,720 --> 00:01:53,280 -With the view we unflatten the tensor  -to get a matrix with a row for each   - -19 -00:01:53,280 --> 00:01:57,280 -sample in the batch and a column for each  -position in the sequence of the samples.   - -20 -00:01:58,720 --> 00:02:03,600 -We don’t need the loss per position so we average  -the loss over all positions for each sample.   - -21 -00:02:06,000 --> 00:02:10,960 -For the weights we use boolean logic to get  -a tensor with 1s where a keyword occurred   - -22 -00:02:10,960 --> 00:02:17,840 -and 0s where not. This tensor has an additional  -dimension as the loss tensor we just saw because   - -23 -00:02:17,840 --> 00:02:24,480 -we get the information for each keyword in a  -separate matrix. Only want to know how many   - -24 -00:02:24,480 --> 00:02:30,320 -times keywords occurred per sample so we can sum  -over all keywords and all positions per sample.   - -25 -00:02:33,280 --> 00:02:39,760 -Now we are almost there, we only need to combine  -the loss with the weight per sample. We do this   - -26 -00:02:39,760 --> 00:02:43,920 -with element wise multiplication and then  -average over all samples in the batch.   - -27 -00:02:44,720 --> 00:02:48,000 -In the end we have exactly one  -loss value for the whole batch.   - -28 -00:02:48,880 --> 00:02:52,800 -And this is the whole necessary logic  -to create a custom weighted loss.   - -29 -00:02:56,080 --> 00:03:02,640 -Let’s see how we can make use of that custom loss  -with Accelerate and the Trainer In Accelerate we   - -30 -00:03:02,640 --> 00:03:07,680 -just pass the input_ids to the models to get the  -logits and can then call the custom loss function.   - -31 -00:03:08,800 --> 00:03:12,800 -After that we continue with the normal  -training loop by for example calling backward.   - -32 -00:03:13,840 --> 00:03:19,200 -For the Trainer we can overwrite the compute  -loss function of the standard trainer. We   - -33 -00:03:19,200 --> 00:03:23,360 -just need to make sure that that we return the  -loss and the model outputs in the same format.   - -34 -00:03:24,240 --> 00:03:31,840 -With that you can integrate your own awesome loss  -function with both the trainer and accelerates. 
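As a hedged sketch of the weighted loss described here: the function and class names, the alpha factor, and the keytoken_ids list are illustrative assumptions, not code taken verbatim from the video.

import torch
from torch.nn import CrossEntropyLoss
from transformers import Trainer

def keytoken_weighted_loss(inputs, logits, keytoken_ids, alpha=1.0):
    # Align inputs and logits: tokens up to position n predict token n + 1.
    shift_labels = inputs[..., 1:].contiguous()
    shift_logits = logits[..., :-1, :].contiguous()
    # Per-token loss, reshaped to one row per sample, then averaged over positions.
    loss_fct = CrossEntropyLoss(reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    loss_per_sample = loss.view(shift_logits.size(0), shift_logits.size(1)).mean(dim=1)
    # Count keyword occurrences per sample: sum over all keywords and all positions.
    weights = torch.stack([(inputs == kt).float() for kt in keytoken_ids]).sum(dim=(0, 2))
    weights = alpha * (1.0 + weights)
    # Element-wise combination, averaged over the batch: one loss value per batch.
    return (loss_per_sample * weights / weights.sum()).sum()

# keytoken_ids is assumed to be a list of token ids for keywords such as pd or np.
class KeytokenTrainer(Trainer):  # hypothetical subclass name
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(inputs["input_ids"])
        loss = keytoken_weighted_loss(inputs["input_ids"], outputs.logits, keytoken_ids)
        return (loss, outputs) if return_outputs else loss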
+1 +00:00:00,573 --> 00:00:01,636 +(air whooshing) + +2 +00:00:01,636 --> 00:00:02,594 +(logo popping) + +3 +00:00:02,594 --> 00:00:05,550 +(metal sliding) + +4 +00:00:05,550 --> 00:00:07,500 +- In this video, we take +a look at setting up + +5 +00:00:07,500 --> 00:00:09,303 +a custom loss function for training. + +6 +00:00:10,980 --> 00:00:13,260 +In the default loss function, all samples, + +7 +00:00:13,260 --> 00:00:15,840 +such as these code snippets, +are treated the same + +8 +00:00:15,840 --> 00:00:18,960 +irrespective of their content +but there are scenarios + +9 +00:00:18,960 --> 00:00:21,660 +where it could make sense to +weight the samples differently. + +10 +00:00:21,660 --> 00:00:24,570 +If, for example, one sample +contains a lot of tokens + +11 +00:00:24,570 --> 00:00:26,160 +that are of interest to us + +12 +00:00:26,160 --> 00:00:29,910 +or if a sample has a +favorable diversity of tokens. + +13 +00:00:29,910 --> 00:00:31,950 +We can also implement other heuristics + +14 +00:00:31,950 --> 00:00:33,963 +with pattern matching or other rules. + +15 +00:00:35,993 --> 00:00:39,150 +For each sample, we get a +loss value during training + +16 +00:00:39,150 --> 00:00:41,850 +and we can combine that +loss with a weight. + +17 +00:00:41,850 --> 00:00:43,860 +Then we can create a weighted sum + +18 +00:00:43,860 --> 00:00:45,660 +or average over all samples + +19 +00:00:45,660 --> 00:00:47,613 +to get the final loss for the batch. + +20 +00:00:48,690 --> 00:00:51,240 +Let's have a look at a specific example. + +21 +00:00:51,240 --> 00:00:52,830 +We want to set up a language model + +22 +00:00:52,830 --> 00:00:56,073 +that helps us autocomplete +common data science code. + +23 +00:00:57,030 --> 00:01:01,830 +For that task, we would like +to weight samples stronger + +24 +00:01:01,830 --> 00:01:04,110 +where tokens related to +the data science stack, + +25 +00:01:04,110 --> 00:01:07,353 +such as pd or np, occur more frequently. + +26 +00:01:10,140 --> 00:01:13,080 +Here you see a loss function +that does exactly that + +27 +00:01:13,080 --> 00:01:15,180 +for causal language modeling. + +28 +00:01:15,180 --> 00:01:18,030 +It takes the model's input +and predicted logits, + +29 +00:01:18,030 --> 00:01:20,343 +as well as the key tokens, as input. + +30 +00:01:21,869 --> 00:01:25,113 +First, the inputs and logits are aligned. + +31 +00:01:26,490 --> 00:01:29,310 +Then the loss per sample is calculated, + +32 +00:01:29,310 --> 00:01:30,843 +followed by the weights. + +33 +00:01:32,430 --> 00:01:35,583 +Finally, the loss and the weights +are combined and returned. + +34 +00:01:36,540 --> 00:01:39,150 +This is a pretty big function, +so let's take a closer look + +35 +00:01:39,150 --> 00:01:40,953 +at the loss and the weight blocks. + +36 +00:01:43,380 --> 00:01:45,600 +During the calculation +of the standard loss, + +37 +00:01:45,600 --> 00:01:48,930 +the logits and labels are +flattened over the batch. + +38 +00:01:48,930 --> 00:01:52,590 +With the view, we unflatten +the tensor to get the matrix + +39 +00:01:52,590 --> 00:01:55,320 +with a row for each sample +in the batch and a column + +40 +00:01:55,320 --> 00:01:57,723 +for each position in the +sequence of the sample. + +41 +00:01:58,920 --> 00:02:00,600 +We don't need the loss per position, + +42 +00:02:00,600 --> 00:02:04,083 +so we average the loss over +all positions for each sample. 
+
+43
+00:02:06,150 --> 00:02:08,970
+For the weights, we use
+Boolean logic to get a tensor

+44
+00:02:08,970 --> 00:02:12,483
+with 1s where a keyword
+occurred and 0s where not.

+45
+00:02:13,440 --> 00:02:15,690
+This tensor has an additional dimension

+46
+00:02:15,690 --> 00:02:18,540
+as the loss tensor we
+just saw because we get

+47
+00:02:18,540 --> 00:02:21,693
+the information for each
+keyword in a separate matrix.

+48
+00:02:22,770 --> 00:02:24,120
+We only want to know

+49
+00:02:24,120 --> 00:02:26,880
+how many times keywords
+occurred per sample,

+50
+00:02:26,880 --> 00:02:30,693
+so we can sum over all keywords
+and all positions per sample.

+51
+00:02:33,450 --> 00:02:35,010
+Now we're almost there.

+52
+00:02:35,010 --> 00:02:38,850
+We only need to combine the
+loss with the weight per sample.

+53
+00:02:38,850 --> 00:02:41,790
+We do this with element
+wise multiplication

+54
+00:02:41,790 --> 00:02:45,233
+and then average over all
+samples in the batch.

+55
+00:02:45,233 --> 00:02:46,066
+In the end,

+56
+00:02:46,066 --> 00:02:49,110
+we have exactly one loss
+value for the whole batch

+57
+00:02:49,110 --> 00:02:51,330
+and this is the whole necessary logic

+58
+00:02:51,330 --> 00:02:53,223
+to create a custom weighted loss.

+59
+00:02:56,250 --> 00:02:59,010
+Let's see how we can make
+use of that custom loss

+60
+00:02:59,010 --> 00:03:00,753
+with Accelerate and the Trainer.

+61
+00:03:01,710 --> 00:03:04,656
+In Accelerate, we just pass the input_ids

+62
+00:03:04,656 --> 00:03:05,730
+to the model to get the logits

+63
+00:03:05,730 --> 00:03:08,103
+and then we can call the
+custom loss function.

+64
+00:03:09,000 --> 00:03:11,310
+After that, we continue with
+the normal training loop

+65
+00:03:11,310 --> 00:03:13,083
+by, for example, calling backward.

+66
+00:03:14,010 --> 00:03:15,570
+For the Trainer, we can overwrite

+67
+00:03:15,570 --> 00:03:19,260
+the compute_loss function
+of the standard Trainer.

+68
+00:03:19,260 --> 00:03:20,970
+We just need to make sure that we return

+69
+00:03:20,970 --> 00:03:24,450
+the loss and the model
+outputs in the same format.

+70
+00:03:24,450 --> 00:03:27,570
+With that, you can integrate
+your own awesome loss function

+71
+00:03:27,570 --> 00:03:29,763
+with both the Trainer and Accelerate.

+72
+00:03:31,389 --> 00:03:34,056
+(air whooshing)
+
diff --git a/subtitles/en/65_data-processing-for-question-answering.srt b/subtitles/en/65_data-processing-for-question-answering.srt
index 81bc4280f..c0ea48326 100644
--- a/subtitles/en/65_data-processing-for-question-answering.srt
+++ b/subtitles/en/65_data-processing-for-question-answering.srt
@@ -1,185 +1,277 @@
-1
-00:00:05,569 --> 00:00:10,490
-Let's study how to preprocess a dataset for
-question answering!
-
-2
-00:00:10,490 --> 00:00:14,260
-Question answering is the task of finding
-answers to a question in some context.
-
-3
-00:00:14,260 --> 00:00:19,970
-For our example, we will use the squad dataset,
-in which we remove columns we won't use and
-
-4
-00:00:19,970 --> 00:00:24,390
-just extract the information we will need
-for the labels: the start and the end of the
-
-5
-00:00:24,390 --> 00:00:25,390
-answer in the context.
- -6 -00:00:25,390 --> 00:00:30,279 -If you have your own dataset for question -answering, just make sure you clean your data - -7 -00:00:30,279 --> 00:00:34,800 -to get to the same point, with one column -containing the questions, one column containing - -8 -00:00:34,800 --> 00:00:39,350 -the contexts, one column for the index of -the start and end character of the answer - -9 -00:00:39,350 --> 00:00:41,700 -in the context. - -10 -00:00:41,700 --> 00:00:44,610 -Note that the answer must be part of the context. - -11 -00:00:44,610 --> 00:00:48,360 -If you want to perform generative question -answering, look at one of the sequence to - -12 -00:00:48,360 --> 00:00:50,890 -sequence videos linked below. - -13 -00:00:50,890 --> 00:00:55,860 -Now if we have a look at the tokens we will -feed our model we will see the answer lies - -14 -00:00:55,860 --> 00:00:58,450 -somewhere inside the context. - -15 -00:00:58,450 --> 00:01:02,239 -For very long context that answer may get -truncated by the tokenizer. - -16 -00:01:02,239 --> 00:01:06,050 -In this case, we wont have any proper labels -for our model. - -17 -00:01:06,050 --> 00:01:11,159 -So we should keep the truncated part as a -separate feature instead of discarding it. - -18 -00:01:11,159 --> 00:01:14,720 -The only thing we need to be careful with, -is to allow some overlap between separate - -19 -00:01:14,720 --> 00:01:19,900 -chunks so that the answer is not truncated, -and that the feature containing the answer - -20 -00:01:19,900 --> 00:01:22,670 -gets sufficient context to be able to predict -it. - -21 -00:01:22,670 --> 00:01:28,790 -Here is how it can be done by the tokenizer: -we pass it the question, context, set the - -22 -00:01:28,790 --> 00:01:32,750 -truncation for the context only and the padding -to the maximum length. - -23 -00:01:32,750 --> 00:01:39,590 -The stride argument is where we set the number -of overlapping tokens, and the return_overflowing_tokens - -24 -00:01:39,590 --> 00:01:42,869 -means we don't want to discard the truncated -part. - -25 -00:01:42,869 --> 00:01:47,140 -Lastly, we also return the offset mappings -to be able to find the tokens corresponding - -26 -00:01:47,140 --> 00:01:48,649 -to the answer start and end. - -27 -00:01:48,649 --> 00:01:53,990 -We want those two tokens, because there will -be the labels we pass to our model. - -28 -00:01:53,990 --> 00:01:57,200 -In a one-hot encoded version, here is what -they look like. - -29 -00:01:57,200 --> 00:02:02,119 -If the context we have does not contain the -answer, we set the two labels to the index - -30 -00:02:02,119 --> 00:02:04,329 -of the CLS token. - -31 -00:02:04,329 --> 00:02:08,629 -We also do this if the context only partially -contains the answer. - -32 -00:02:08,629 --> 00:02:13,950 -In terms of code, here is how we can do it: -using the sequence IDs of an input, we can - -33 -00:02:13,950 --> 00:02:17,390 -determine the beginning and the end of the -context. - -34 -00:02:17,390 --> 00:02:22,290 -Then we know if have to return the CLS position -for the two labels or we determine the positions - -35 -00:02:22,290 --> 00:02:25,120 -of the first and last tokens of the answer. - -36 -00:02:25,120 --> 00:02:28,670 -We can check it works properly on our previous -example. - -37 -00:02:28,670 --> 00:02:35,319 -Putting it all together looks like this big -function, which we can apply to our datasets. 
- -38 -00:02:35,319 --> 00:02:40,010 -Since we applied padding during the tokenization, -we can then use this directly in the Trainer - -39 -00:02:40,010 --> 00:02:43,920 -or apply the to_tf_dataset method to use Keras.fit. +1 +00:00:05,580 --> 00:00:07,177 +- Let's study how to preprocess a dataset + +2 +00:00:07,177 --> 00:00:08,643 +for question answering. + +3 +00:00:10,200 --> 00:00:11,640 +Question answering is a task + +4 +00:00:11,640 --> 00:00:14,343 +of finding answers to a +question in some context. + +5 +00:00:15,270 --> 00:00:17,550 +For example, we'll use the SQuAD dataset + +6 +00:00:17,550 --> 00:00:19,860 +in which we remove columns we won't use + +7 +00:00:19,860 --> 00:00:21,660 +and just extract the +information we will need + +8 +00:00:21,660 --> 00:00:22,950 +for the labels, + +9 +00:00:22,950 --> 00:00:26,370 +the start and the end of +the answer in the context. + +10 +00:00:26,370 --> 00:00:28,690 +If you have your own dataset +for question answering, + +11 +00:00:28,690 --> 00:00:31,680 +just make sure you clean your +data to get to the same point, + +12 +00:00:31,680 --> 00:00:33,900 +with one column containing the questions, + +13 +00:00:33,900 --> 00:00:35,940 +one column containing the context, + +14 +00:00:35,940 --> 00:00:38,610 +one column for the index of +the start and end character + +15 +00:00:38,610 --> 00:00:40,473 +of the answer in the context. + +16 +00:00:41,610 --> 00:00:44,520 +Note that the answer must +be part of the context. + +17 +00:00:44,520 --> 00:00:47,160 +If you want to perform +generative question answering, + +18 +00:00:47,160 --> 00:00:50,160 +look at one of the sequence to +sequence videos linked below. + +19 +00:00:51,600 --> 00:00:53,430 +Now, if we have a look at the tokens + +20 +00:00:53,430 --> 00:00:54,750 +we will feed our model, + +21 +00:00:54,750 --> 00:00:58,320 +we'll see the answer lies +somewhere inside the context. + +22 +00:00:58,320 --> 00:01:01,080 +For very long context, that +answer may get truncated + +23 +00:01:01,080 --> 00:01:02,580 +by the tokenizer. + +24 +00:01:02,580 --> 00:01:05,970 +In this case, we won't have any +proper labels for our model, + +25 +00:01:05,970 --> 00:01:07,680 +so we should keep the truncated part + +26 +00:01:07,680 --> 00:01:10,203 +as a separate feature +instead of discarding it. + +27 +00:01:11,100 --> 00:01:12,990 +The only thing we need to be careful with + +28 +00:01:12,990 --> 00:01:15,660 +is to allow some overlap +between separate chunks + +29 +00:01:15,660 --> 00:01:17,670 +so that the answer is not truncated + +30 +00:01:17,670 --> 00:01:19,920 +and that the feature containing the answer + +31 +00:01:19,920 --> 00:01:22,623 +gets sufficient context +to be able to predict it. + +32 +00:01:23,490 --> 00:01:26,040 +Here is how it can be +done by the tokenizer. + +33 +00:01:26,040 --> 00:01:29,370 +We pass it the question, +context, set a truncation + +34 +00:01:29,370 --> 00:01:33,240 +for the context only, and the +padding to the maximum length. + +35 +00:01:33,240 --> 00:01:35,340 +The stride argument is +where we set the number + +36 +00:01:35,340 --> 00:01:36,900 +of overlapping tokens, + +37 +00:01:36,900 --> 00:01:39,600 +and the return overflowing +tokens equals true + +38 +00:01:39,600 --> 00:01:42,630 +means we don't want to +discard the truncated part. 
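A minimal sketch of the tokenizer call being described, assuming a fast tokenizer and one dataset row (example) with "question" and "context" fields; the checkpoint, max_length, and stride values are illustrative choices, not taken from the video.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint

inputs = tokenizer(
    example["question"],
    example["context"],
    truncation="only_second",        # truncate the context only, never the question
    padding="max_length",
    max_length=384,                  # illustrative value
    stride=128,                      # number of overlapping tokens between chunks
    return_overflowing_tokens=True,  # keep the truncated parts as extra features
    return_offsets_mapping=True,     # offsets to locate the answer's start/end tokens
)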
+
+39
+00:01:42,630 --> 00:01:45,210
+Lastly, we also return the offset mappings

+40
+00:01:45,210 --> 00:01:47,220
+to be able to find the
+tokens corresponding

+41
+00:01:47,220 --> 00:01:48,693
+to the answer start and end.

+42
+00:01:49,860 --> 00:01:52,290
+We want those tokens because
+they will be the labels

+43
+00:01:52,290 --> 00:01:53,970
+we pass to our model.

+44
+00:01:53,970 --> 00:01:56,870
+In a one-hot encoded version,
+here is what they look like.

+45
+00:01:57,930 --> 00:02:00,480
+If the context we have does
+not contain the answer,

+46
+00:02:00,480 --> 00:02:03,799
+we set the two labels to
+the index of the CLS token.

+47
+00:02:03,799 --> 00:02:05,700
+We also do this if the context

+48
+00:02:05,700 --> 00:02:07,713
+only partially contains the answer.

+49
+00:02:08,580 --> 00:02:11,400
+In terms of code, here
+is how we can do it.

+50
+00:02:11,400 --> 00:02:13,710
+Using the sequence IDs of an input,

+51
+00:02:13,710 --> 00:02:17,220
+we can determine the beginning
+and the end of the context.

+52
+00:02:17,220 --> 00:02:19,800
+Then, we know if we have to
+return the CLS position

+53
+00:02:19,800 --> 00:02:22,290
+for the two labels or we
+determine the position

+54
+00:02:22,290 --> 00:02:25,050
+of the first and last
+tokens of the answer.

+55
+00:02:25,050 --> 00:02:27,800
+We can check it works properly
+on our previous example.

+56
+00:02:28,680 --> 00:02:31,380
+Putting it all together
+looks like this big function,

+57
+00:02:31,380 --> 00:02:34,233
+which we can apply to our
+datasets with the map method.

+58
+00:02:35,310 --> 00:02:37,920
+Since we applied padding
+during the tokenization,

+59
+00:02:37,920 --> 00:02:40,680
+we can then use this
+directly in the Trainer

+60
+00:02:40,680 --> 00:02:44,133
+or apply the to_tf_dataset
+method to use Keras.fit.

diff --git a/subtitles/en/66_the-post-processing-step-in-question-answering-(pytorch).srt b/subtitles/en/66_the-post-processing-step-in-question-answering-(pytorch).srt
index be70d0f01..d4a6fd6db 100644
--- a/subtitles/en/66_the-post-processing-step-in-question-answering-(pytorch).srt
+++ b/subtitles/en/66_the-post-processing-step-in-question-answering-(pytorch).srt
@@ -1,169 +1,342 @@
-1
-00:00:05,680 --> 00:00:12,000
-The post-processing step in a question  
-answering task. When doing question answering,   

-2
-00:00:12,000 --> 00:00:17,440
-the processing of the initial dataset implies  
-splitting examples in several features, which   

-3
-00:00:17,440 --> 00:00:23,760
-may or may not contain the answer. Passing those  
-features through the model will give us logits for   

-4
-00:00:23,760 --> 00:00:29,280
-the start and end positions, since our labels are  
-the indices of the tokens that correspond to the   

-5
-00:00:29,280 --> 00:00:35,600
-start and end the answer. We must then somehow  
-convert those logits into an answer, and then   

-6
-00:00:35,600 --> 00:00:40,480
-pick one of the various answers each feature  
-gives to be THE answer for a given example.   

-7
-00:00:42,080 --> 00:00:46,080
-For the processing step, you should refer  
-to the video linked below. 
It's not very   - -8 -00:00:46,080 --> 00:00:50,240 -different for validation, we just need to  -add a few lines to keep track of two things:   - -9 -00:00:51,440 --> 00:00:56,000 -instead of discarding the offset mapping,  -we keep them, and also include in them the   - -10 -00:00:56,000 --> 00:01:01,440 -information of where the context is by setting  -the offsets of the special tokens and the question   - -11 -00:01:01,440 --> 00:01:06,400 -to None. Then we also keep track  -of the example ID for each feature,   - -12 -00:01:06,400 --> 00:01:10,160 -to be able to map back feature to the  -examples that they originated from.   - -13 -00:01:11,680 --> 00:01:15,840 -If you don't want to compute the validation loss,  -you won't need to include all the special code   - -14 -00:01:15,840 --> 00:01:21,360 -that we used to create the labels. With this done,  -we can apply that preprocessing function using the   - -15 -00:01:21,360 --> 00:01:26,160 -map method. We take the SQUAD dataset like in  -the preprocessing for question-answering video.   - -16 -00:01:27,520 --> 00:01:31,920 -Once this is done, the next step is to create  -our model. We use the default model behind the   - -17 -00:01:31,920 --> 00:01:36,000 -question-answering pipeline here, but you  -should use any model you want to evaluate.   - -18 -00:01:36,720 --> 00:01:41,200 -We will run a manual evaluation loop, so we  -create a PyTorch DataLoader with our features.   - -19 -00:01:42,240 --> 00:01:46,400 -With it, we can compute and gather all  -the start and end logits like this,   - -20 -00:01:46,400 --> 00:01:52,240 -with a standard PyTorch evaluation loop. With this  -done, we can really dive into the post-processing.   - -21 -00:01:53,680 --> 00:01:57,440 -We will need a map from examples to  -features, which we can create like this.   - -22 -00:01:58,560 --> 00:02:02,720 -Now, for the main part of the post-processing,  -let's see how to extract an answer from the   - -23 -00:02:02,720 --> 00:02:08,480 -logits. We could just take the best index for the  -start and end logits and be done, but if our model   - -24 -00:02:08,480 --> 00:02:13,200 -predicts something impossible, like tokens in  -the question, we will look at more of the logits.   - -25 -00:02:15,040 --> 00:02:19,120 -Note that in the question-answering pipeline,  -we attributed score to each answer based on the   - -26 -00:02:19,120 --> 00:02:24,560 -probabilities, which we did not compute here.  -In terms of logits, the multiplication we had   - -27 -00:02:24,560 --> 00:02:31,520 -in the scores becomes an addition. To go fast, we  -don't look at all possible start and end logits,   - -28 -00:02:31,520 --> 00:02:37,120 -but the twenty best ones. We ignore the logits  -that spawn impossible answers or answer that are   - -29 -00:02:37,120 --> 00:02:43,040 -too long. As we saw in the preprocessing,  -the labels (0, 0) correspond to no answer,   - -30 -00:02:43,040 --> 00:02:48,400 -otherwise we use the offsets to get the answer  -inside the context. Let's have a look at the   - -31 -00:02:48,400 --> 00:02:52,640 -predicted answer for the first feature, which  -is the answer with the best score (or the best   - -32 -00:02:52,640 --> 00:02:58,720 -logit score since the SoftMax is an increasing  -function). The model got it right! Next we just   - -33 -00:02:58,720 --> 00:03:03,920 -have to loop this for every example, picking for  -each the answer with the best logit score in all   - -34 -00:03:03,920 --> 00:03:15,440 -the features the example generated. 
Now you know  -how to get answers from your model predictions! +1 +00:00:00,315 --> 00:00:02,982 +(air whooshing) + +2 +00:00:05,940 --> 00:00:08,913 +- The post-processing step +in a question answering task. + +3 +00:00:10,440 --> 00:00:12,180 +When doing question answering, + +4 +00:00:12,180 --> 00:00:14,550 +the processing of the initial dataset + +5 +00:00:14,550 --> 00:00:17,370 +implies splitting examples +in several features, + +6 +00:00:17,370 --> 00:00:19,773 +which may or may not contain the answer. + +7 +00:00:21,000 --> 00:00:22,740 +Passing those features through the model + +8 +00:00:22,740 --> 00:00:25,830 +will give us logits for the +start and end positions, + +9 +00:00:25,830 --> 00:00:28,650 +since our labels are +the indices of the token + +10 +00:00:28,650 --> 00:00:31,050 +that correspond to the +start and end the answer. + +11 +00:00:32,664 --> 00:00:35,490 +We must then somehow convert +those logits into an answer, + +12 +00:00:35,490 --> 00:00:38,610 +and then pick one of the various +answers each feature gives + +13 +00:00:38,610 --> 00:00:40,893 +to be the answer for a given example. + +14 +00:00:42,300 --> 00:00:43,500 +For the processing step, + +15 +00:00:43,500 --> 00:00:45,750 +you should refer to +the video linked below. + +16 +00:00:45,750 --> 00:00:47,820 +It's not very different for validation, + +17 +00:00:47,820 --> 00:00:50,820 +we just need to add a few lines +to keep track of two things. + +18 +00:00:51,660 --> 00:00:54,960 +Instead of discarding the +offset mappings, we keep them, + +19 +00:00:54,960 --> 00:00:55,793 +and also include in them + +20 +00:00:55,793 --> 00:00:58,350 +the information of where the context is + +21 +00:00:58,350 --> 00:01:00,690 +by setting the offsets +of the special tokens + +22 +00:01:00,690 --> 00:01:02,253 +and the question to None. + +23 +00:01:03,480 --> 00:01:06,630 +Then we also keep track of the +example ID for each feature, + +24 +00:01:06,630 --> 00:01:08,280 +to be able to map back feature + +25 +00:01:08,280 --> 00:01:10,503 +to the examples that they originated from. + +26 +00:01:11,940 --> 00:01:14,100 +If you don't want to +compute the validation loss, + +27 +00:01:14,100 --> 00:01:15,990 +you won't need to include +all the special code + +28 +00:01:15,990 --> 00:01:18,420 +that we used to create the labels. + +29 +00:01:18,420 --> 00:01:21,090 +With this done, we can apply +that preprocessing function + +30 +00:01:21,090 --> 00:01:22,890 +using the map method. + +31 +00:01:22,890 --> 00:01:24,090 +We take the SQUAD dataset + +32 +00:01:24,090 --> 00:01:26,840 +like in the preprocessing +for question-answering video. + +33 +00:01:27,810 --> 00:01:30,540 +Once this is done, the next +step is to create our model. + +34 +00:01:30,540 --> 00:01:31,710 +We use the default model + +35 +00:01:31,710 --> 00:01:33,930 +behind the question-answering +pipeline here, + +36 +00:01:33,930 --> 00:01:36,960 +but you should use any +model you want to evaluate. + +37 +00:01:36,960 --> 00:01:38,850 +We'll run a manual evaluation loop, + +38 +00:01:38,850 --> 00:01:41,583 +so we create a PyTorch +DataLoader with our features. + +39 +00:01:42,657 --> 00:01:44,520 +With it, we can compute and gather + +40 +00:01:44,520 --> 00:01:46,650 +all the start and end logits like this, + +41 +00:01:46,650 --> 00:01:49,653 +with a standard PyTorch evaluation loop. + +42 +00:01:49,653 --> 00:01:53,220 +With this done, we can really +dive into the post-processing. 
+ +43 +00:01:53,220 --> 00:01:56,340 +First, we'll need a map +from example to features, + +44 +00:01:56,340 --> 00:01:57,873 +which we can create like this. + +45 +00:01:58,800 --> 00:02:00,810 +Now, for the main part +of the post-processing, + +46 +00:02:00,810 --> 00:02:04,230 +let's see how to extract +an answer from the logits. + +47 +00:02:04,230 --> 00:02:05,760 +We could just take the best index + +48 +00:02:05,760 --> 00:02:07,980 +for the start and end logits and be done, + +49 +00:02:07,980 --> 00:02:10,380 +but if our model predicts +something impossible, + +50 +00:02:10,380 --> 00:02:12,150 +like tokens in the question, + +51 +00:02:12,150 --> 00:02:13,940 +we'll look at more of the logits. + +52 +00:02:15,270 --> 00:02:17,070 +Note that in the +question-answering pipeline, + +53 +00:02:17,070 --> 00:02:18,870 +we attributed score to each answer + +54 +00:02:18,870 --> 00:02:20,430 +based on the probabilities, + +55 +00:02:20,430 --> 00:02:22,350 +which we did not compute here. + +56 +00:02:22,350 --> 00:02:25,560 +In terms of logits, the +multiplication we had in the scores + +57 +00:02:25,560 --> 00:02:26,853 +becomes an addition. + +58 +00:02:28,110 --> 00:02:29,010 +To go fast, + +59 +00:02:29,010 --> 00:02:31,800 +we don't look at all possible +start and end logits, + +60 +00:02:31,800 --> 00:02:34,050 +but the 20 best one is enough. + +61 +00:02:34,050 --> 00:02:36,570 +We ignore the logits that +spawn impossible answers + +62 +00:02:36,570 --> 00:02:38,550 +or answer that are too long. + +63 +00:02:38,550 --> 00:02:41,430 +As we saw in the +preprocessing, the labels 0,0 + +64 +00:02:41,430 --> 00:02:43,230 +correspond to a no answer. + +65 +00:02:43,230 --> 00:02:45,090 +Otherwise we use the offsets + +66 +00:02:45,090 --> 00:02:46,940 +to get the answer inside the context. + +67 +00:02:47,910 --> 00:02:49,107 +Let's have a look at the predicted answer + +68 +00:02:49,107 --> 00:02:50,370 +for the first feature, + +69 +00:02:50,370 --> 00:02:51,930 +which is the answer with the best score + +70 +00:02:51,930 --> 00:02:53,640 +or the best logit score + +71 +00:02:53,640 --> 00:02:56,280 +since the SoftMax is +an increasing function. + +72 +00:02:56,280 --> 00:02:58,230 +The model got it right. + +73 +00:02:58,230 --> 00:03:00,690 +Next we just have to loop +this for every example, + +74 +00:03:00,690 --> 00:03:03,720 +picking for each the answer +with the best logit score + +75 +00:03:03,720 --> 00:03:06,750 +in all the features the example generated. + +76 +00:03:06,750 --> 00:03:09,700 +Now you know how to get answers +from your model prediction. + +77 +00:03:11,007 --> 00:03:13,674 +(air whooshing) + diff --git a/subtitles/en/67_the-post-processing-step-in-question-answering-(tensorflow).srt b/subtitles/en/67_the-post-processing-step-in-question-answering-(tensorflow).srt index d29515a59..59c0957ce 100644 --- a/subtitles/en/67_the-post-processing-step-in-question-answering-(tensorflow).srt +++ b/subtitles/en/67_the-post-processing-step-in-question-answering-(tensorflow).srt @@ -1,169 +1,329 @@ -1 -00:00:05,760 --> 00:00:08,560 -The post-processing step in  -a question answering task.   - -2 -00:00:10,640 --> 00:00:14,640 -When doing question answering, the  -processing of the initial dataset   - -3 -00:00:14,640 --> 00:00:20,960 -implies splitting examples in several features,  -which may or may not contain the answer. 
Passing   - -4 -00:00:20,960 --> 00:00:25,680 -those features through the model will give  -us logits for the start and end positions,   - -5 -00:00:25,680 --> 00:00:30,640 -since our labels are the indices of the tokens  -that correspond to the start and end the answer.   - -6 -00:00:31,600 --> 00:00:36,560 -We must then somehow convert those logits into an  -answer, and then pick one of the various answers   - -7 -00:00:36,560 --> 00:00:43,280 -each feature gives to be THE answer for a given  -example. For the processing step, you should   - -8 -00:00:43,280 --> 00:00:47,840 -refer to the video linked below. It's not very  -different for validation, we just need to add a   - -9 -00:00:47,840 --> 00:00:53,520 -few lines to keep track of two things: instead  -of discarding the offset mapping, we keep them,   - -10 -00:00:53,520 --> 00:00:58,240 -and also include in them the information of  -where the context is by setting the offsets   - -11 -00:00:58,240 --> 00:01:04,240 -of the special tokens and the question to None.  -Then we also keep track of the example ID for   - -12 -00:01:04,240 --> 00:01:08,880 -each feature, to be able to map back feature  -to the examples that they originated from.   - -13 -00:01:10,240 --> 00:01:14,400 -If you don't want to compute the validation loss,  -you won't need to include all the special code   - -14 -00:01:14,400 --> 00:01:19,840 -that we used to create the labels. With this done,  -we can apply that preprocessing function using the   - -15 -00:01:19,840 --> 00:01:26,160 -map method. We take the SQUAD dataset like in  -the preprocessing for question-answering video.   - -16 -00:01:26,160 --> 00:01:30,560 -Once this is done, the next step is to create  -our model. We use the default model behind the   - -17 -00:01:30,560 --> 00:01:34,560 -question-answering pipeline here, but you  -should use any model you want to evaluate.   - -18 -00:01:35,600 --> 00:01:40,560 -With the to_tf_dataset method, we can just  -sent our processed dataset to model.predict,   - -19 -00:01:41,120 --> 00:01:44,880 -and we directly get our start and end logits  -for the whole dataset as NumPy arrays.   - -20 -00:01:45,600 --> 00:01:51,040 -With this done, we can really dive into the  -post-processing. We will need a map from examples   - -21 -00:01:51,040 --> 00:01:57,040 -to features, which we can create like this. Now,  -for the main part of the post-processing, let's   - -22 -00:01:57,040 --> 00:02:02,080 -see how to extract an answer from the logits. We  -could just take the best index for the start and   - -23 -00:02:02,080 --> 00:02:07,680 -end logits and be done, but if our model predicts  -something impossible, like tokens in the question,   - -24 -00:02:07,680 --> 00:02:13,040 -we will look at more of the logits. Note that in  -the question-answering pipeline, we attributed   - -25 -00:02:13,040 --> 00:02:18,560 -score to each answer based on the probabilities,  -which we did not compute here. In terms of logits,   - -26 -00:02:18,560 --> 00:02:24,080 -the multiplication we had in the scores becomes  -an addition. To go fast, we don't look at all   - -27 -00:02:24,080 --> 00:02:29,040 -possible start and end logits, but the twenty  -best ones. We ignore the logits that spawn   - -28 -00:02:29,040 --> 00:02:34,240 -impossible answers or answer that are too long.  -As we saw in the preprocessing, the labels (0,   - -29 -00:02:34,240 --> 00:02:38,880 -0) correspond to no answer, otherwise we use the  -offsets to get the answer inside the context.   
- -30 -00:02:39,920 --> 00:02:43,760 -Let's have a look at the predicted answer  -for the first feature, which is the answer   - -31 -00:02:43,760 --> 00:02:47,680 -with the best score (or the best logit score  -since the SoftMax is an increasing function).   - -32 -00:02:48,480 --> 00:02:54,000 -The model got it right! Next we just  -have to loop this for every example,   - -33 -00:02:54,000 --> 00:02:58,880 -picking for each the answer with the best logit  -score in all the features the example generated.   - -34 -00:02:59,840 --> 00:03:03,840 -Now you know how to get answers  -from your model predictions! +1 +00:00:00,367 --> 00:00:02,950 +(subtle blast) + +2 +00:00:05,850 --> 00:00:08,913 +- The post-processing step +in a question-answering task. + +3 +00:00:10,830 --> 00:00:11,790 +When doing question answering, + +4 +00:00:11,790 --> 00:00:14,670 +the processing of the initial dataset + +5 +00:00:14,670 --> 00:00:18,090 +implies splitting examples +in several features, + +6 +00:00:18,090 --> 00:00:20,850 +which may or may not contain the answer. + +7 +00:00:20,850 --> 00:00:22,530 +Passing those features through the model + +8 +00:00:22,530 --> 00:00:25,860 +will give us logits for the +start and end positions, + +9 +00:00:25,860 --> 00:00:28,620 +since our labels are the +indices of the tokens + +10 +00:00:28,620 --> 00:00:31,020 +that correspond to the +start and end the answer. + +11 +00:00:31,860 --> 00:00:34,740 +We must then somehow convert +those logits into an answer, + +12 +00:00:34,740 --> 00:00:38,070 +and then pick one of the various +answers each feature gives + +13 +00:00:38,070 --> 00:00:40,473 +to be the answer for a given example. + +14 +00:00:41,683 --> 00:00:43,200 +For the processing step, + +15 +00:00:43,200 --> 00:00:45,450 +you should refer to +the video linked below. + +16 +00:00:45,450 --> 00:00:47,310 +It's not very different for validation, + +17 +00:00:47,310 --> 00:00:50,053 +we just need to add a few lines +to keep track of two things: + +18 +00:00:50,053 --> 00:00:52,620 +instead of discarding the offset mappings, + +19 +00:00:52,620 --> 00:00:55,380 +we keep them, and also include +in them the information + +20 +00:00:55,380 --> 00:00:58,410 +of where the context is +by setting the offsets + +21 +00:00:58,410 --> 00:01:01,821 +of the special tokens +and the question to None. + +22 +00:01:01,821 --> 00:01:05,370 +Then we also keep track of the +example ID for each feature, + +23 +00:01:05,370 --> 00:01:07,020 +to be able to map back feature + +24 +00:01:07,020 --> 00:01:09,243 +to the examples that they originated from. + +25 +00:01:10,470 --> 00:01:12,660 +If you don't want to +compute the validation loss, + +26 +00:01:12,660 --> 00:01:14,610 +you won't need to include +all the special code + +27 +00:01:14,610 --> 00:01:17,010 +that we used to create the labels. + +28 +00:01:17,010 --> 00:01:19,650 +With this done, we can apply +that preprocessing function + +29 +00:01:19,650 --> 00:01:21,480 +using the map method. + +30 +00:01:21,480 --> 00:01:23,610 +We take the SQUAD dataset +like in the preprocessing + +31 +00:01:23,610 --> 00:01:25,060 +for question-answering video. + +32 +00:01:26,400 --> 00:01:29,310 +Once this is done, the next +step is to create our model. + +33 +00:01:29,310 --> 00:01:30,570 +We use the default model behind + +34 +00:01:30,570 --> 00:01:32,640 +the question-answering pipeline here, + +35 +00:01:32,640 --> 00:01:35,880 +but you should use any +model you want to evaluate. 
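As a rough sketch of the answer-extraction logic described above, assuming start_logits and end_logits are NumPy arrays for a single feature and offsets is the offset mapping kept at preprocessing time (the function name and the max_answer_length value are illustrative assumptions):

import numpy as np

n_best = 20
max_answer_length = 30  # illustrative limit

def best_answer_for_feature(start_logits, end_logits, offsets, context):
    # Take the 20 best start and end indices instead of only the single best one.
    start_indexes = np.argsort(start_logits)[-1 : -n_best - 1 : -1].tolist()
    end_indexes = np.argsort(end_logits)[-1 : -n_best - 1 : -1].tolist()
    best = {"text": "", "score": None}
    for start_index in start_indexes:
        for end_index in end_indexes:
            # Skip spans outside the context, reversed spans, and spans that are too long.
            if offsets[start_index] is None or offsets[end_index] is None:
                continue
            if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                continue
            # In terms of logits, the score is an addition instead of a multiplication.
            score = start_logits[start_index] + end_logits[end_index]
            if best["score"] is None or score > best["score"]:
                start_char, end_char = offsets[start_index][0], offsets[end_index][1]
                best = {"text": context[start_char:end_char], "score": score}
    return best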
+ +36 +00:01:35,880 --> 00:01:37,680 +With the to_tf_dataset method, + +37 +00:01:37,680 --> 00:01:41,370 +we can just sent our processed +dataset to model.predict, + +38 +00:01:41,370 --> 00:01:43,350 +and we directly get our +start and end logits + +39 +00:01:43,350 --> 00:01:45,930 +for the whole dataset as NumPy arrays. + +40 +00:01:45,930 --> 00:01:49,230 +With this done, we can really +dive into the post-processing. + +41 +00:01:49,230 --> 00:01:52,380 +First, we'll need a map +from example to features, + +42 +00:01:52,380 --> 00:01:53,883 +which we can create like this. + +43 +00:01:54,780 --> 00:01:56,700 +Now, for the main part +of the post-processing, + +44 +00:01:56,700 --> 00:02:00,270 +let's see how to extract +an answer from the logits. + +45 +00:02:00,270 --> 00:02:01,650 +We could just take the best index + +46 +00:02:01,650 --> 00:02:03,690 +for the start and end logits and be done, + +47 +00:02:03,690 --> 00:02:06,180 +but if our model predicts +something impossible, + +48 +00:02:06,180 --> 00:02:07,920 +like tokens in the questions, + +49 +00:02:07,920 --> 00:02:09,670 +we will look at more of the logits. + +50 +00:02:10,800 --> 00:02:12,570 +Note that in the +question-answering pipeline, + +51 +00:02:12,570 --> 00:02:14,160 +we attributed the score to each answer + +52 +00:02:14,160 --> 00:02:17,880 +based on the probabilities, +which we did not compute here. + +53 +00:02:17,880 --> 00:02:19,860 +In terms of logits, the +multiplication we had + +54 +00:02:19,860 --> 00:02:21,663 +in the scores becomes an addition. + +55 +00:02:22,650 --> 00:02:23,910 +To go fast, we don't look + +56 +00:02:23,910 --> 00:02:25,343 +at all possible start and end logits, + +57 +00:02:25,343 --> 00:02:26,973 +but the 20 best ones. + +58 +00:02:27,810 --> 00:02:30,386 +We ignore the logits that +spawn impossible answers + +59 +00:02:30,386 --> 00:02:32,370 +or answer that are too long. + +60 +00:02:32,370 --> 00:02:33,720 +As we saw in the preprocessing, + +61 +00:02:33,720 --> 00:02:36,240 +the label "0, 0" correspond to no answer, + +62 +00:02:36,240 --> 00:02:37,440 +otherwise we use the offset + +63 +00:02:37,440 --> 00:02:39,290 +to get the answer inside the context. + +64 +00:02:40,260 --> 00:02:41,580 +Let's have a look at the predicted answer + +65 +00:02:41,580 --> 00:02:43,200 +for the first feature, + +66 +00:02:43,200 --> 00:02:44,790 +which is the answer with the best score, + +67 +00:02:44,790 --> 00:02:46,860 +or the best logit score since the SoftMax + +68 +00:02:46,860 --> 00:02:48,810 +is an increasing function. + +69 +00:02:48,810 --> 00:02:49,960 +The model got it right. + +70 +00:02:51,210 --> 00:02:54,180 +Next, we just have to loop +this for every example, + +71 +00:02:54,180 --> 00:02:56,700 +picking for each the answer +with the best logit score + +72 +00:02:56,700 --> 00:02:59,133 +in all the features the example generated. + +73 +00:03:00,030 --> 00:03:03,030 +Now you know how to get answers +from your model predictions. + +74 +00:03:04,214 --> 00:03:06,797 +(subtle blast) + diff --git a/subtitles/en/68_data-collators-a-tour.srt b/subtitles/en/68_data-collators-a-tour.srt index e388c1513..56895ea7e 100644 --- a/subtitles/en/68_data-collators-a-tour.srt +++ b/subtitles/en/68_data-collators-a-tour.srt @@ -1,341 +1,655 @@ -1 -00:00:06,220 --> 00:00:12,290 -In a lot of our examples, you're going to -see DataCollators popping up over and over. 
- -2 -00:00:12,290 --> 00:00:18,010 -They're used in both PyTorch and TensorFlow -workflows, and maybe even in JAX, but no-one - -3 -00:00:18,010 --> 00:00:20,260 -really knows what's happening in JAX. - -4 -00:00:20,260 --> 00:00:24,590 -We have a research team working on that, so -maybe they'll tell us soon. - -5 -00:00:24,590 --> 00:00:27,869 -But what are data collators? - -6 -00:00:27,869 --> 00:00:32,230 -Data collators collate data. - -7 -00:00:32,230 --> 00:00:37,930 -More specifically, they put together a list -of samples into a single training minibatch. - -8 -00:00:37,930 --> 00:00:41,820 -For some tasks, the data collator can be very -straightforward. - -9 -00:00:41,820 --> 00:00:47,010 -For example, when you're doing sequence classification, -all you really need from your data collator - -10 -00:00:47,010 --> 00:00:53,480 -is that it pads your samples to the same length -and concatenates them into a single Tensor. - -11 -00:00:53,480 --> 00:00:58,989 -But for other workflows, data collators can -be more complex, as they handle some of the - -12 -00:00:58,989 --> 00:01:04,879 -preprocessing needed for that particular task. - -13 -00:01:04,879 --> 00:01:09,600 -For PyTorch users, you usually pass the DataCollator -to your Trainer object. - -14 -00:01:09,600 --> 00:01:15,549 -In TensorFlow, the easiest way to use a DataCollator -is to pass it to the to_tf_dataset method - -15 -00:01:15,549 --> 00:01:23,700 -of your dataset. - -16 -00:01:23,700 --> 00:01:27,420 -You'll see these approaches used in the examples -and notebooks throughout this course. - -17 -00:01:27,420 --> 00:01:28,820 -In both cases, you end up with an iterable -that's going to output collated batches, ready - -18 -00:01:28,820 --> 00:01:29,820 -for training. - -19 -00:01:29,820 --> 00:01:34,360 -Note that all of our collators take a return_tensors -argument - you can set this to "pt" to get - -20 -00:01:34,360 --> 00:01:40,820 -PyTorch Tensors, "tf" to get TensorFlow Tensors, -or "np" to get Numpy arrays. - -21 -00:01:40,820 --> 00:01:46,060 -For backward compatibility reasons, the default -value is "pt", so PyTorch users don't even - -22 -00:01:46,060 --> 00:01:51,110 -have to set this argument most of the time, -and so are often totally unaware that this - -23 -00:01:51,110 --> 00:01:52,110 -option exists. - -24 -00:01:52,110 --> 00:01:59,160 -This is a valuable lesson about how the beneficiaries -of privilege are often the most blind to its - -25 -00:01:59,160 --> 00:02:00,160 -existence. - -26 -00:02:00,160 --> 00:02:08,130 -So now let's see some specific DataCollators -in action, though remember that if none of - -27 -00:02:08,130 --> 00:02:12,069 -them do what you need, you can always write -your own! - -28 -00:02:12,069 --> 00:02:17,120 -First, we'll see the "basic" data collators. - -29 -00:02:17,120 --> 00:02:21,550 -These are DefaultDataCollator and DataCollatorWithPadding. - -30 -00:02:21,550 --> 00:02:25,550 -These are the ones you should use if your -labels are straightforward and your data doesn't - -31 -00:02:25,550 --> 00:02:28,780 -need any special processing before being ready -for training. - -32 -00:02:28,780 --> 00:02:30,100 -Most sequence classification tasks, for example, -would use one of these data collators. - -33 -00:02:30,100 --> 00:02:35,470 -Remember that because different models have -different padding tokens, DataCollatorWithPadding - -34 -00:02:35,470 --> 00:02:39,239 -will need your model's Tokenizer so it knows -how to pad sequences properly! 
- -35 -00:02:39,239 --> 00:02:41,069 -So how do you choose one of these? - -36 -00:02:41,069 --> 00:02:44,970 -Simple: As you can see here, if you have variable -sequence lengths then you should use DataCollatorWithPadding, - -37 -00:02:44,970 --> 00:02:46,690 -which will pad all your sequences to the same -length. - -38 -00:02:46,690 --> 00:02:49,819 -If you're sure all your sequences are the -same length then you can use the even simpler - -39 -00:02:49,819 --> 00:02:51,510 -DefaultDataCollator, but it'll give you an -error if that assumption is wrong! - -40 -00:02:51,510 --> 00:02:57,720 -Moving on, though, many of the other data -collators are often designed to handle one - -41 -00:02:57,720 --> 00:03:07,411 -specific task, and that's the case with DataCollatorForTokenClassification -and DataCollatorForSeqToSeq. - -42 -00:03:07,411 --> 00:03:12,830 -These tasks need special collators because -the labels are variable in length. - -43 -00:03:12,830 --> 00:03:16,580 -In token classification there's one label -for each token, and that means the length - -44 -00:03:16,580 --> 00:03:23,599 -of the labels can be variable, while in SeqToSeq -the labels are also a sequence of tokens that - -45 -00:03:23,599 --> 00:03:28,470 -can have variable length. - -46 -00:03:28,470 --> 00:03:38,580 -In both of these cases, we handle that by -padding the labels too, as you can see here. - -47 -00:03:38,580 --> 00:03:43,810 -Inputs and the labels will need to be padded -if we want to join samples of variable length - -48 -00:03:43,810 --> 00:03:50,440 -into the same minibatch, and that's exactly -what the data collators will do. - -49 -00:03:50,440 --> 00:04:01,680 -The final data collator I want to show you -is the DataCollatorForLanguageModeling. - -50 -00:04:01,680 --> 00:04:07,470 -It's very important, firstly because language -models are so foundational to everything we - -51 -00:04:07,470 --> 00:04:15,030 -do in NLP, and secondly because it has two -modes that do two very different things. - -52 -00:04:15,030 --> 00:04:21,889 -You choose which mode you want with the mlm -argument - set it to True for masked language - -53 -00:04:21,889 --> 00:04:26,729 -modeling, and False for causal language modeling. - -54 -00:04:26,729 --> 00:04:31,110 -Collating data for causal language modeling -is actually quite straightforward - the model - -55 -00:04:31,110 --> 00:04:35,962 -is just making predictions for what token -comes next, so your labels are more or less - -56 -00:04:35,962 --> 00:04:40,530 -just a copy of your inputs, and the collator -handles that and ensures your inputs and labels - -57 -00:04:40,530 --> 00:04:42,380 -are padded correctly. - -58 -00:04:42,380 --> 00:04:49,500 -When you set mlm to True, though, you get -quite different behaviour! - -59 -00:04:49,500 --> 00:04:58,250 -That's because masked language modeling requires -the labels to be, well... masked. - -60 -00:04:58,250 --> 00:05:01,539 -So what does that look like? - -61 -00:05:01,539 --> 00:05:06,860 -Recall that in masked language modeling, the -model is not predicting "the next word"; instead - -62 -00:05:06,860 --> 00:05:11,590 -we randomly mask out multiple tokens and the -model makes predictions for all of them at - -63 -00:05:11,590 --> 00:05:12,590 -once. 
- -64 -00:05:12,590 --> 00:05:18,729 -The process of random masking is surprisingly -complex, though - that's because if we follow - -65 -00:05:18,729 --> 00:05:23,770 -the protocol from the original BERT paper, -we need to replace some tokens with a masking - -66 -00:05:23,770 --> 00:05:30,080 -token, other tokens with a random token and -then keep a third set of tokens unchanged. - -67 -00:05:30,080 --> 00:05:35,919 -This isn't the lecture to go into *why* we -do that - you should check out the original - -68 -00:05:35,919 --> 00:05:40,720 -BERT paper if you're curious. - -69 -00:05:40,720 --> 00:05:46,949 -The main thing to know here is that it can -be a real pain to implement yourself, but - -70 -00:05:46,949 --> 00:05:53,300 -DataCollatorForLanguageModeling will do it -for you. - -71 -00:05:53,300 --> 00:05:57,800 -And that's it! - -72 -00:05:57,800 --> 00:06:15,410 -That covers the most commonly used data collators -and the tasks they're used for. +1 +00:00:00,670 --> 00:00:01,503 +(whooshing sound) + +2 +00:00:01,503 --> 00:00:02,469 +(sticker popping) + +3 +00:00:02,469 --> 00:00:05,302 +(whooshing sound) + +4 +00:00:06,240 --> 00:00:08,220 +In a lot of our examples, + +5 +00:00:08,220 --> 00:00:12,150 +you're going to see DataCollators +popping up over and over. + +6 +00:00:12,150 --> 00:00:16,020 +They're used in both PyTorch +and TensorFlow workflows, + +7 +00:00:16,020 --> 00:00:17,460 +and maybe even in JAX, + +8 +00:00:17,460 --> 00:00:20,130 +but no-one really knows +what's happening in JAX. + +9 +00:00:20,130 --> 00:00:21,840 +We do have a research +team working on it though, + +10 +00:00:21,840 --> 00:00:23,970 +so maybe they'll tell us soon. + +11 +00:00:23,970 --> 00:00:25,620 +But coming back on topic. + +12 +00:00:25,620 --> 00:00:27,600 +What are data collators? + +13 +00:00:27,600 --> 00:00:30,480 +Data collators collate data. + +14 +00:00:30,480 --> 00:00:31,800 +That's not that helpful. + +15 +00:00:31,800 --> 00:00:35,023 +But to be more specific, they +put together a list of samples + +16 +00:00:35,023 --> 00:00:37,830 +into a single training minibatch. + +17 +00:00:37,830 --> 00:00:38,910 +For some tasks, + +18 +00:00:38,910 --> 00:00:41,670 +the data collator can +be very straightforward. + +19 +00:00:41,670 --> 00:00:44,820 +For example, when you're +doing sequence classification, + +20 +00:00:44,820 --> 00:00:47,010 +all you really need +from your data collator + +21 +00:00:47,010 --> 00:00:49,860 +is that it pads your +samples to the same length + +22 +00:00:49,860 --> 00:00:52,413 +and concatenates them +into a single Tensor. + +23 +00:00:53,340 --> 00:00:57,750 +But for other workflows, data +collators can be quite complex + +24 +00:00:57,750 --> 00:00:59,910 +as they handle some of the preprocessing + +25 +00:00:59,910 --> 00:01:02,340 +needed for that particular task. + +26 +00:01:02,340 --> 00:01:04,800 +So, if you want to use a data collator, + +27 +00:01:04,800 --> 00:01:07,860 +for PyTorch users, you +usually pass the data collator + +28 +00:01:07,860 --> 00:01:09,780 +to your Trainer object. + +29 +00:01:09,780 --> 00:01:11,310 +In TensorFlow, it's a bit different. + +30 +00:01:11,310 --> 00:01:12,960 +The easiest way to use a data collator + +31 +00:01:12,960 --> 00:01:16,860 +is to pass it to the to_tf_dataset +method of your dataset. + +32 +00:01:16,860 --> 00:01:20,198 +And this will give you a +tensorflow_tf_data.dataset + +33 +00:01:20,198 --> 00:01:22,743 +that you can then pass to model.fit. 
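A short sketch of the two usage patterns just described, assuming a tokenized dataset named tokenized_dataset; the checkpoint, column names, and batch size are illustrative assumptions.

from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # illustrative checkpoint

# TensorFlow: pass the collator to to_tf_dataset to get a tf.data.Dataset for model.fit.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
tf_dataset = tokenized_dataset.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["labels"],
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator,
)

# PyTorch: hand the collator (return_tensors="pt" is the default) to the Trainer instead.
# trainer = Trainer(model=model, args=training_args,
#                   train_dataset=tokenized_dataset,
#                   data_collator=DataCollatorWithPadding(tokenizer=tokenizer))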
+ +34 +00:01:23,580 --> 00:01:25,890 +You'll see these approaches +used in the examples + +35 +00:01:25,890 --> 00:01:28,068 +and notebooks throughout this course. + +36 +00:01:28,068 --> 00:01:30,180 +Also note that all of our collators + +37 +00:01:30,180 --> 00:01:32,610 +take a return_tensors argument. + +38 +00:01:32,610 --> 00:01:35,737 +You can set this to "pt" +to get PyTorch Tensors, + +39 +00:01:35,737 --> 00:01:37,920 +"tf" to get TensorFlow Tensors, + +40 +00:01:37,920 --> 00:01:40,404 +or "np" to get Numpy arrays. + +41 +00:01:40,404 --> 00:01:42,450 +For backward compatibility reasons, + +42 +00:01:42,450 --> 00:01:44,460 +the default value is "pt", + +43 +00:01:44,460 --> 00:01:47,160 +so PyTorch users don't even +have to set this argument + +44 +00:01:47,160 --> 00:01:48,270 +most of the time. + +45 +00:01:48,270 --> 00:01:50,820 +And so as a result, they're +often totally unaware + +46 +00:01:50,820 --> 00:01:52,713 +that this argument even exists. + +47 +00:01:53,730 --> 00:01:55,050 +We can learn something from this + +48 +00:01:55,050 --> 00:01:57,120 +which is that the +beneficiaries of privilege + +49 +00:01:57,120 --> 00:01:59,793 +are often the most blind to its existence. + +50 +00:02:00,690 --> 00:02:01,920 +But okay, coming back. + +51 +00:02:01,920 --> 00:02:06,540 +Let's see how some specific +data collators work in action. + +52 +00:02:06,540 --> 00:02:08,070 +Although again, remember if none + +53 +00:02:08,070 --> 00:02:09,900 +of the built-in data +collators do what you need, + +54 +00:02:09,900 --> 00:02:13,650 +you can always write your own +and they're often quite short. + +55 +00:02:13,650 --> 00:02:16,950 +So first, we'll see the +"basic" data collators. + +56 +00:02:16,950 --> 00:02:20,433 +These are DefaultDataCollator +and DataCollatorWithPadding. + +57 +00:02:21,420 --> 00:02:22,830 +These are the ones you should use + +58 +00:02:22,830 --> 00:02:24,720 +if your labels are straightforward + +59 +00:02:24,720 --> 00:02:27,300 +and your data doesn't need +any special processing + +60 +00:02:27,300 --> 00:02:29,673 +before being ready for training. + +61 +00:02:29,673 --> 00:02:31,272 +Notice that because different models + +62 +00:02:31,272 --> 00:02:33,690 +have different padding tokens, + +63 +00:02:33,690 --> 00:02:37,170 +DataCollatorWithPadding will +need your model's Tokenizer + +64 +00:02:37,170 --> 00:02:40,150 +so it knows how to pad sequences properly. + +65 +00:02:40,150 --> 00:02:44,790 +The default data collator +doesn't need a Tokenizer to work, + +66 +00:02:44,790 --> 00:02:46,710 +but it will as a result throw an error + +67 +00:02:46,710 --> 00:02:48,900 +unless all of your sequences +are the same length. + +68 +00:02:48,900 --> 00:02:50,500 +So, you should be aware of that. + +69 +00:02:51,480 --> 00:02:52,860 +Moving on though. + +70 +00:02:52,860 --> 00:02:54,300 +A lot of the other data collators + +71 +00:02:54,300 --> 00:02:56,130 +aside from the basic two are, + +72 +00:02:56,130 --> 00:02:59,490 +they're usually designed to +handle one specific task. + +73 +00:02:59,490 --> 00:03:01,050 +And so, I'm going to show a couple here. + +74 +00:03:01,050 --> 00:03:04,320 +These are +DataCollatorForTokenClassification + +75 +00:03:04,320 --> 00:03:06,447 +and DataCollatorForSeqToSeq. + +76 +00:03:06,447 --> 00:03:09,540 +And the reason these tasks +need special collators + +77 +00:03:09,540 --> 00:03:12,600 +is because their labels +are variable in length. 
+ +78 +00:03:12,600 --> 00:03:15,960 +In token classification there's +one label for each token, + +79 +00:03:15,960 --> 00:03:17,400 +and so the length of the labels + +80 +00:03:17,400 --> 00:03:18,993 +is the length of the sequence. + +81 +00:03:20,280 --> 00:03:23,520 +While in SeqToSeq the labels +are a sequence of tokens + +82 +00:03:23,520 --> 00:03:24,780 +that can be variable length, + +83 +00:03:24,780 --> 00:03:25,800 +that can be very different + +84 +00:03:25,800 --> 00:03:28,200 +from the length of the input sequence. + +85 +00:03:28,200 --> 00:03:32,880 +So in both of these cases, we +handle collating that batch + +86 +00:03:32,880 --> 00:03:35,280 +by padding the labels as well, + +87 +00:03:35,280 --> 00:03:37,410 +as you can see here in this example. + +88 +00:03:37,410 --> 00:03:40,770 +So, inputs and the labels +will need to be padded + +89 +00:03:40,770 --> 00:03:43,860 +if we want to join +samples of variable length + +90 +00:03:43,860 --> 00:03:45,120 +into the same minibatch. + +91 +00:03:45,120 --> 00:03:47,520 +That's exactly what the data collators + +92 +00:03:47,520 --> 00:03:50,460 +and that's exactly what these +data collators will do for us + +93 +00:03:50,460 --> 00:03:52,383 +you know, for this particular task. + +94 +00:03:53,820 --> 00:03:56,070 +So, there's one final data collator + +95 +00:03:56,070 --> 00:03:58,560 +I want to show you as +well just in this lecture. + +96 +00:03:58,560 --> 00:04:00,473 +And that's the +DataCollatorForLanguageModeling. + +97 +00:04:01,410 --> 00:04:03,390 +So, it's very important, and it's firstly, + +98 +00:04:03,390 --> 00:04:05,820 +because language models +are just so foundational + +99 +00:04:05,820 --> 00:04:09,720 +to do for everything we +do with NLP these days. + +100 +00:04:09,720 --> 00:04:12,060 +But secondly, because it has two modes + +101 +00:04:12,060 --> 00:04:14,760 +that do two very different things. + +102 +00:04:14,760 --> 00:04:19,230 +So you choose which mode you +want with the mlm argument. + +103 +00:04:19,230 --> 00:04:22,470 +Set it to True for +masked language modeling, + +104 +00:04:22,470 --> 00:04:26,190 +and set it to False for +causal language modeling. + +105 +00:04:26,190 --> 00:04:28,620 +So, collating data for +causal language modeling + +106 +00:04:28,620 --> 00:04:30,750 +is actually quite straightforward. + +107 +00:04:30,750 --> 00:04:32,640 +The model is just making predictions + +108 +00:04:32,640 --> 00:04:35,460 +for what token comes +next, and so your labels + +109 +00:04:35,460 --> 00:04:37,800 +are more or less just +a copy of your inputs, + +110 +00:04:37,800 --> 00:04:39,090 +and the collator will handle that + +111 +00:04:39,090 --> 00:04:42,240 +and ensure that the inputs and +labels are padded correctly. + +112 +00:04:42,240 --> 00:04:44,910 +When you set mlm to True though, + +113 +00:04:44,910 --> 00:04:46,786 +you get quite different behavior, + +114 +00:04:46,786 --> 00:04:49,200 +that's different from +any other data collator, + +115 +00:04:49,200 --> 00:04:51,660 +and that's because setting mlm to True + +116 +00:04:51,660 --> 00:04:53,550 +means masked language modeling + +117 +00:04:53,550 --> 00:04:55,680 +and that means the labels need to be, + +118 +00:04:55,680 --> 00:04:58,080 +you know, the inputs need to be masked. + +119 +00:04:58,080 --> 00:05:00,093 +So, what does that look like? 
+ +120 +00:05:01,050 --> 00:05:03,900 +So, recall that in +masked language modeling, + +121 +00:05:03,900 --> 00:05:06,570 +the model is not predicting the next word, + +122 +00:05:06,570 --> 00:05:09,240 +instead we randomly mask out some tokens + +123 +00:05:09,240 --> 00:05:11,130 +and the model predicts +all of them at once. + +124 +00:05:11,130 --> 00:05:12,780 +So, it tries to kinda fill in the blanks + +125 +00:05:12,780 --> 00:05:14,790 +for those masked tokens. + +126 +00:05:14,790 --> 00:05:18,210 +But the process of random +masking is surprisingly complex. + +127 +00:05:18,210 --> 00:05:21,330 +If we follow the protocol +from the original BERT paper, + +128 +00:05:21,330 --> 00:05:23,970 +we need to replace some +tokens with a masked token, + +129 +00:05:23,970 --> 00:05:26,190 +some other tokens with a random token, + +130 +00:05:26,190 --> 00:05:29,820 +and then keep a third +set of tokens unchanged. + +131 +00:05:29,820 --> 00:05:30,840 +Yeah, this is not the lecture + +132 +00:05:30,840 --> 00:05:33,903 +to go into the specifics +of that or why we do it. + +133 +00:05:33,903 --> 00:05:36,660 +You can always check out +the original BERT paper + +134 +00:05:36,660 --> 00:05:37,493 +if you're curious. + +135 +00:05:37,493 --> 00:05:39,620 +It's well written. It's +easy to understand. + +136 +00:05:40,650 --> 00:05:44,190 +The main thing to know here +is that it can be a real pain + +137 +00:05:44,190 --> 00:05:46,770 +and quite complex to +implement that yourself. + +138 +00:05:46,770 --> 00:05:49,740 +But DataCollatorForLanguageModeling +will do it for you + +139 +00:05:49,740 --> 00:05:51,750 +when you set mlm to True. + +140 +00:05:51,750 --> 00:05:54,690 +And that's an example +of the more intricate + +141 +00:05:54,690 --> 00:05:57,870 +preprocessing that some +of our data collators do. + +142 +00:05:57,870 --> 00:05:59,430 +And that's it! + +143 +00:05:59,430 --> 00:06:01,920 +So, this covers the most +commonly used data collators + +144 +00:06:01,920 --> 00:06:03,480 +and the tasks they're used for. + +145 +00:06:03,480 --> 00:06:06,990 +And hopefully, now you'll know +when to use data collators + +146 +00:06:06,990 --> 00:06:10,833 +and which one to choose +for your specific task. + +147 +00:06:11,765 --> 00:06:14,598 +(whooshing sound) + diff --git a/subtitles/en/69_what-to-do-when-you-get-an-error.srt b/subtitles/en/69_what-to-do-when-you-get-an-error.srt index 6a3df254f..f3907616f 100644 --- a/subtitles/en/69_what-to-do-when-you-get-an-error.srt +++ b/subtitles/en/69_what-to-do-when-you-get-an-error.srt @@ -1,134 +1,271 @@ -1 -00:00:05,440 --> 00:00:13,760 -In this video, we will learn the first things  -to do when you get an error. Let's say we want   - -2 -00:00:13,760 --> 00:00:18,320 -to use the question answering pipeline on  -a particular model and we get the following   - -3 -00:00:18,320 --> 00:00:24,160 -error. Errors in Python can appear overwhelming  -because you get so much information printed out,   - -4 -00:00:24,160 --> 00:00:28,160 -but that's because Python is trying to help  -you the best it can to solve your problem.   - -5 -00:00:28,880 --> 00:00:32,000 -In this video we will see how to  -interpret the error report we get.   - -6 -00:00:33,280 --> 00:00:37,920 -The first thing to notice at the very top is  -that Python shows you with a clear arrow the   - -7 -00:00:37,920 --> 00:00:42,400 -line of code that triggered the error. 
So you  -don't have to fiddle with your code and remove   - -8 -00:00:42,400 --> 00:00:47,520 -random lines to figure out where the error comes  -from, you have the answer in front right here.   - -9 -00:00:48,880 --> 00:00:53,280 -The arrows you see below are the parts of the  -code Python tried to execute while running the   - -10 -00:00:53,280 --> 00:00:59,600 -instruction: here we are inside the pipeline  -function and the error came on this line while   - -11 -00:00:59,600 --> 00:01:04,800 -trying to execute the function check_tasks,  -which then raised the KeyError we see displayed.   - -12 -00:01:06,480 --> 00:01:11,600 -Note that Python tells you exactly where the  -functions it's executing live, so if you feel   - -13 -00:01:11,600 --> 00:01:17,680 -adventurous, you can even go inspect the source  -code. This whole thing is called the traceback.   - -14 -00:01:19,840 --> 00:01:23,600 -If you are running your code on Colab,  -the Traceback is automatically minimized,   - -15 -00:01:23,600 --> 00:01:29,920 -so you have to click to expand it. At the very  -end of the traceback, you finally get the actual   - -16 -00:01:29,920 --> 00:01:34,960 -error message. The first thing you should do  -when encountering an error is to read that   - -17 -00:01:34,960 --> 00:01:40,640 -error message. Here it's telling us it doesn't  -know the question answering task, and helpfully   - -18 -00:01:40,640 --> 00:01:46,560 -gives us the list of supported tasks... in  -which we can see that question answering is.   - -19 -00:01:47,280 --> 00:01:51,680 -Looking more closely though, we used  -an underscore to separate the two words   - -20 -00:01:51,680 --> 00:01:55,040 -when the task is written with  -a minus, so we should fix that!   - -21 -00:01:57,280 --> 00:02:02,160 -Now let's retry our code with the task properly  -written and what is happening today? Another   - -22 -00:02:02,160 --> 00:02:08,000 -error! As we saw before, we go look at the bottom  -to read the actual error message. It's telling us   - -23 -00:02:08,000 --> 00:02:13,600 -that we should check our model is a correct model  -identifier, so let's hop on to hf.co/models.   - -24 -00:02:14,480 --> 00:02:18,320 -We can see our model listed there in the  -ones available for question answering.   - -25 -00:02:19,120 --> 00:02:22,480 -The difference is that it's  -spelled distilbert with one l,   - -26 -00:02:22,480 --> 00:02:28,960 -and we used two. So let's fix that. We finally  -get our results! If your error is more complex,   - -27 -00:02:28,960 --> 00:02:35,840 -you might need to use the Python debugger,  -check out the videos linked below to learn how! +1 +00:00:00,380 --> 00:00:02,463 +(whoosh) + +2 +00:00:05,550 --> 00:00:07,590 +- In this video we'll +learn the first things to + +3 +00:00:07,590 --> 00:00:09,330 +do when you get an error. + +4 +00:00:09,330 --> 00:00:11,930 +This is not throwing your +laptop through the window. + +5 +00:00:13,320 --> 00:00:15,450 +Let's say we want to use the +question answering pipeline + +6 +00:00:15,450 --> 00:00:19,470 +on a particular model and +we get the following error. + +7 +00:00:19,470 --> 00:00:21,750 +Errors in Python can appear overwhelming + +8 +00:00:21,750 --> 00:00:24,390 +because you get so much +information printed out + +9 +00:00:24,390 --> 00:00:26,610 +but that's because Python +is trying to help you + +10 +00:00:26,610 --> 00:00:29,070 +the best it can to solve your problem. 
+
+11
+00:00:29,070 --> 00:00:31,260
+In this video, we'll see how to interpret

+12
+00:00:31,260 --> 00:00:32,460
+the error report we get.

+13
+00:00:33,510 --> 00:00:35,700
+The first thing to notice at the very top

+14
+00:00:35,700 --> 00:00:38,070
+is that Python shows
+you with a clear arrow

+15
+00:00:38,070 --> 00:00:40,320
+the line of code that triggers the error

+16
+00:00:40,320 --> 00:00:42,210
+so you don't have to fiddle with your code

+17
+00:00:42,210 --> 00:00:43,800
+and remove random lines to figure out

+18
+00:00:43,800 --> 00:00:45,540
+where the error comes from.

+19
+00:00:45,540 --> 00:00:47,890
+You have the answer in
+front of you right here.

+20
+00:00:49,140 --> 00:00:51,360
+The arrows you see below
+are the parts of the code

+21
+00:00:51,360 --> 00:00:54,930
+Python tried to execute while
+running the instruction.

+22
+00:00:54,930 --> 00:00:57,750
+Here we are inside the pipeline function

+23
+00:00:57,750 --> 00:00:59,490
+and the error came on this line

+24
+00:00:59,490 --> 00:01:02,520
+while trying to execute
+the function "check_tasks,"

+25
+00:01:02,520 --> 00:01:05,103
+which then raised the
+KeyError we see displayed.

+26
+00:01:06,630 --> 00:01:08,580
+Note that Python tells you exactly

+27
+00:01:08,580 --> 00:01:11,190
+where the function it's executing lives,

+28
+00:01:11,190 --> 00:01:12,810
+so if you feel adventurous

+29
+00:01:12,810 --> 00:01:14,810
+you can even go inspect the source code.

+30
+00:01:15,900 --> 00:01:18,447
+This whole thing is
+called the "Traceback."

+31
+00:01:20,010 --> 00:01:21,870
+If you're running your code on Colab

+32
+00:01:21,870 --> 00:01:23,820
+the Traceback is automatically minimized,

+33
+00:01:23,820 --> 00:01:25,833
+so you have to click to expand it.

+34
+00:01:26,820 --> 00:01:28,530
+At the very end of the Traceback

+35
+00:01:28,530 --> 00:01:31,890
+you finally get the actual error message.

+36
+00:01:31,890 --> 00:01:33,660
+The first thing you should
+do when encountering

+37
+00:01:33,660 --> 00:01:36,480
+an error is to read that error message.

+38
+00:01:36,480 --> 00:01:38,640
+Here it's telling us it doesn't know

+39
+00:01:38,640 --> 00:01:40,230
+the question answering task

+40
+00:01:40,230 --> 00:01:41,760
+and helpfully gives us the list

+41
+00:01:41,760 --> 00:01:44,850
+of supported tasks in which we can see

+42
+00:01:44,850 --> 00:01:47,520
+that "question-answering" actually is.

+43
+00:01:47,520 --> 00:01:49,200
+Looking more closely though,

+44
+00:01:49,200 --> 00:01:52,020
+we used an underscore to
+separate the two words

+45
+00:01:52,020 --> 00:01:54,300
+when the task is written with a minus,

+46
+00:01:54,300 --> 00:01:55,413
+so we should fix that.

+47
+00:01:57,510 --> 00:02:00,360
+Now let's retry our code with
+the task properly written

+48
+00:02:00,360 --> 00:02:01,920
+and what is happening today?

+49
+00:02:01,920 --> 00:02:03,210
+Another error.

+50
+00:02:03,210 --> 00:02:05,670
+As we said before, we
+go look at the bottom

+51
+00:02:05,670 --> 00:02:07,560
+to read the actual error message.

+52
+00:02:07,560 --> 00:02:09,000
+It's telling us that we should check

+53
+00:02:09,000 --> 00:02:11,340
+our model is a correct model identifier,

+54
+00:02:11,340 --> 00:02:14,760
+so let's hop onto hf.co/models.

+55
+00:02:14,760 --> 00:02:16,440
+We can see our model listed there

+56
+00:02:16,440 --> 00:02:19,440
+in the ones available
+for question answering.
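Putting both fixes from this video together, the corrected call looks roughly like the sketch below: the task name is written with a hyphen and the model identifier is copied exactly from hf.co/models. The checkpoint shown is only an example, not necessarily the one used on screen.

```python
from transformers import pipeline

question_answerer = pipeline(
    "question-answering",  # hyphen, not an underscore
    model="distilbert-base-cased-distilled-squad",  # example checkpoint, spelled as on the Hub
)

result = question_answerer(
    question="Where can I check model identifiers?",
    context="Model identifiers can be verified on hf.co/models before being used in a pipeline.",
)
print(result)
```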
+ +57 +00:02:19,440 --> 00:02:21,720 +The difference is that +it's spelled "distilbert" + +58 +00:02:21,720 --> 00:02:24,240 +with one L, and we use two, + +59 +00:02:24,240 --> 00:02:25,650 +so let's fix that. + +60 +00:02:25,650 --> 00:02:27,570 +We finally get our results. + +61 +00:02:27,570 --> 00:02:29,160 +If our error is more complex, + +62 +00:02:29,160 --> 00:02:31,290 +you might need to use the Python debugger. + +63 +00:02:31,290 --> 00:02:33,483 +Check out the videos below to learn how. + diff --git a/subtitles/en/70_using-a-debugger-in-a-notebook.srt b/subtitles/en/70_using-a-debugger-in-a-notebook.srt index 99ffa2f4b..f543f4b30 100644 --- a/subtitles/en/70_using-a-debugger-in-a-notebook.srt +++ b/subtitles/en/70_using-a-debugger-in-a-notebook.srt @@ -1,132 +1,319 @@ -1 -00:00:05,280 --> 00:00:11,760 -Using the Python debugger in a notebook. In  -this video, we'll learn how to use the Python   - -2 -00:00:11,760 --> 00:00:17,040 -debugger in a Jupyter Notebook or a Colab.  -For this example, we are running code from   - -3 -00:00:17,040 --> 00:00:24,640 -the token classification section, downloading  -the Conll dataset , having a look at it   - -4 -00:00:27,600 --> 00:00:30,080 -before loading a tokenizer to preprocess it.   - -5 -00:00:32,640 --> 00:00:35,440 -Checkout the section of the course  -linked below for more information.   - -6 -00:00:36,800 --> 00:00:42,240 -Once this is done, we try to batch together some  -features of the training dataset by padding them   - -7 -00:00:44,960 --> 00:00:51,280 -and returning a tensor, then we get the  -following error. We use PyTorch here but   - -8 -00:00:51,280 --> 00:00:56,240 -you will get the same error with TensorFlow. As we  -have seen in the "How to debug an error?" video,   - -9 -00:00:56,240 --> 00:01:02,480 -the error message is at the end and it indicates  -we should use padding, which we are actually   - -10 -00:01:02,480 --> 00:01:07,680 -trying to do. So this is not useful and we will  -need to go a little deeper to debug the problem.   - -11 -00:01:08,400 --> 00:01:13,040 -Fortunately, you can use the Python debugger at  -any time you get an error in a Jupyter Notebook   - -12 -00:01:13,040 --> 00:01:23,680 -by typing %debug in any cell. When executing that  -cell, you go to the very bottom of the traceback   - -13 -00:01:23,680 --> 00:01:28,560 -where you can type commands and you can type  -commands. The first two commands you should   - -14 -00:01:28,560 --> 00:01:41,760 -learn are u and d (for up and down), which  -allow you to go up in the Traceback or down.   - -15 -00:01:43,920 --> 00:01:46,720 -Going up twice, we get to the  -point the error was reached.   - -16 -00:01:47,600 --> 00:01:53,840 -The third command to learn is p, for print.  -It allows you to print any value you want.   - -17 -00:01:54,560 --> 00:02:00,720 -For instance here, we can see the value  -of return_tensors or batch_outputs   - -18 -00:02:00,720 --> 00:02:11,520 -to try to understand what triggered the error.  -The batch outputs dictionary is a bit hard to see,   - -19 -00:02:12,720 --> 00:02:18,160 -so let's dive into smaller pieces of it. Inside  -the debugger you can not only print any variable   - -20 -00:02:18,160 --> 00:02:28,240 -but also evaluate any expression, so we can  -look independently at the inputs or labels.   - -21 -00:02:35,440 --> 00:02:41,360 -Those labels are definitely weird: they are of  -various size, which we can actually confirm by   - -22 -00:02:41,360 --> 00:02:49,840 -printing the sizes. 
No wonder the tokenizer  -wasn't able to create a tensor with them!   - -23 -00:02:52,160 --> 00:02:56,880 -This is because the pad method only  -takes care of the tokenizer outptus:   - -24 -00:02:56,880 --> 00:02:59,680 -input IDs, attention mask and token type IDs,   - -25 -00:03:00,240 --> 00:03:03,840 -so we have to pad the labels ourselves  -before trying to create a tensor with them.   - -26 -00:03:05,040 --> 00:03:11,440 -Once you are ready to exit the Python debugger,  -you can press q for quit. One way to fix the error   - -27 -00:03:11,440 --> 00:03:21,600 -is to manually pad all labels to the longest, or  -we can use the data collator designed for this. +1 +00:00:05,400 --> 00:00:08,150 +- [Instructor] Using the +Python debugger in a notebook. + +2 +00:00:09,540 --> 00:00:12,330 +In this video, we'll learn +how to use the Python debugger + +3 +00:00:12,330 --> 00:00:15,027 +in a Jupyter Notebook or a Colab. + +4 +00:00:15,027 --> 00:00:17,070 +For this example, we are running code + +5 +00:00:17,070 --> 00:00:19,775 +from the token classification section, + +6 +00:00:19,775 --> 00:00:21,513 +downloading the Conll dataset, + +7 +00:00:23,670 --> 00:00:25,503 +looking a little bit at data, + +8 +00:00:27,840 --> 00:00:29,250 +before loading a tokenizer + +9 +00:00:29,250 --> 00:00:31,173 +to preprocess the whole dataset. + +10 +00:00:32,880 --> 00:00:34,740 +Check out the section of +the course linked below + +11 +00:00:34,740 --> 00:00:35,823 +for more information. + +12 +00:00:37,080 --> 00:00:38,520 +Once this is done, + +13 +00:00:38,520 --> 00:00:41,580 +we try to load eight features +of the training dataset, + +14 +00:00:41,580 --> 00:00:43,080 +and then batch them together, + +15 +00:00:43,080 --> 00:00:45,210 +using tokenizer.pad, + +16 +00:00:45,210 --> 00:00:46,760 +and we get the following error. + +17 +00:00:48,090 --> 00:00:49,230 +We use PyTorch here, + +18 +00:00:49,230 --> 00:00:51,330 +with return_tensors="pt" + +19 +00:00:51,330 --> 00:00:53,273 +but you will get the same +error with TensorFlow. + +20 +00:00:54,120 --> 00:00:55,897 +As we have seen in the "How +to debug an error?" video, + +21 +00:00:55,897 --> 00:00:59,160 +the error message is at +the end of the traceback. + +22 +00:00:59,160 --> 00:01:01,710 +Here, it indicates us +we should use padding, + +23 +00:01:01,710 --> 00:01:04,290 +which we are actually trying to do. + +24 +00:01:04,290 --> 00:01:05,610 +So this is not useful at all, + +25 +00:01:05,610 --> 00:01:06,990 +and we will need to go a little deeper + +26 +00:01:06,990 --> 00:01:08,610 +to debug the problem. + +27 +00:01:08,610 --> 00:01:10,650 +Fortunately, you can +use the Python debugger + +28 +00:01:10,650 --> 00:01:13,170 +at any time you get an +error in a Jupyter Notebook + +29 +00:01:13,170 --> 00:01:16,350 +by typing the magic +command, debug, in a cell. + +30 +00:01:16,350 --> 00:01:18,450 +Don't forget the percent at the beginning. + +31 +00:01:20,400 --> 00:01:21,870 +When executing that cell, + +32 +00:01:21,870 --> 00:01:23,910 +you go to the very bottom of the traceback + +33 +00:01:23,910 --> 00:01:25,320 +where you can type commands + +34 +00:01:25,320 --> 00:01:27,690 +that will help you debug your script. + +35 +00:01:27,690 --> 00:01:29,250 +The first two commands you should learn, + +36 +00:01:29,250 --> 00:01:32,040 +are u and d, for up and down. + +37 +00:01:32,040 --> 00:01:36,090 +Typing u and enter will +take you up one step + +38 +00:01:36,090 --> 00:01:38,910 +in the traceback to the +previous instruction. 
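For reference, here is a minimal, self-contained stand-in for the failing cell described above. The real notebook builds its features from the Conll dataset, so the sentences and label lists below are made up; only the shape of the problem is the same.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # placeholder checkpoint

# Stand-ins for features of a token classification dataset:
# the "labels" lists have different lengths, just like in the real data.
features = [
    {**tokenizer("A short sentence."), "labels": [0, 1, 0]},
    {**tokenizer("A noticeably longer example sentence."), "labels": [0, 1, 2, 0, 1]},
]

# Raises the error discussed above: tokenizer.pad only pads input_ids,
# attention_mask and token_type_ids, so the ragged labels cannot be
# converted to a tensor.
batch = tokenizer.pad(features, return_tensors="pt")
```

Running %debug in the next cell then drops you at the bottom of that traceback, where the commands described here let you move around and print values.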
+
+39
+00:01:38,910 --> 00:01:41,190
+Typing d and then enter will take you

+40
+00:01:41,190 --> 00:01:43,023
+one step down in the traceback.

+41
+00:01:44,130 --> 00:01:47,910
+Going up twice, we get to the
+point the error was reached.

+42
+00:01:47,910 --> 00:01:51,510
+The third command to learn for
+the debugger is p, for print.

+43
+00:01:51,510 --> 00:01:54,780
+It allows you to print any value you want.

+44
+00:01:54,780 --> 00:01:58,740
+For instance, typing p
+return_tensors and enter,

+45
+00:01:58,740 --> 00:02:02,893
+we see the value pt that we
+passed to the pad function.

+46
+00:02:02,893 --> 00:02:05,370
+We can also have a look
+at the batch outputs

+47
+00:02:05,370 --> 00:02:07,353
+this BatchEncoding object gets.

+48
+00:02:09,480 --> 00:02:12,600
+The batch outputs dictionary
+is a bit hard to dig in to,

+49
+00:02:12,600 --> 00:02:15,360
+so let's dive into smaller pieces of it.

+50
+00:02:15,360 --> 00:02:18,390
+Inside the debugger you can
+not only print any variable

+51
+00:02:18,390 --> 00:02:20,970
+but also evaluate any expression,

+52
+00:02:20,970 --> 00:02:23,610
+for instance, we can have a
+look at the input_ids key of

+53
+00:02:23,610 --> 00:02:25,203
+this batch_outputs object.

+54
+00:02:27,600 --> 00:02:30,693
+Or at the labels key of
+this batch_outputs object.

+55
+00:02:35,730 --> 00:02:37,320
+Those labels are definitely weird:

+56
+00:02:37,320 --> 00:02:38,970
+they are of various sizes,

+57
+00:02:38,970 --> 00:02:41,340
+which we can actually confirm, if we want,

+58
+00:02:41,340 --> 00:02:43,983
+by printing the sizes with
+a list comprehension.

+59
+00:02:52,290 --> 00:02:54,913
+This is because the pad
+method of the tokenizer

+60
+00:02:54,913 --> 00:02:57,090
+only takes care of the tokenizer outputs:

+61
+00:02:57,090 --> 00:03:00,450
+input IDs, attention
+mask, and token type IDs,

+62
+00:03:00,450 --> 00:03:02,340
+so we have to pad the labels ourselves

+63
+00:03:02,340 --> 00:03:05,310
+before trying to create
+a tensor with them.

+64
+00:03:05,310 --> 00:03:07,260
+Once you are ready to
+exit the Python debugger,

+65
+00:03:07,260 --> 00:03:09,453
+you can press q and enter for quit.

+66
+00:03:10,320 --> 00:03:11,670
+One way to fix the error

+67
+00:03:11,670 --> 00:03:14,313
+is to manually pad the
+labels to the longest.

+68
+00:03:15,300 --> 00:03:17,400
+Another way is to use a data collator

+69
+00:03:17,400 --> 00:03:19,863
+specifically designed
+for token classification.

+70
+00:03:20,970 --> 00:03:22,950
+You can also use a
+Python debugger directly

+71
+00:03:22,950 --> 00:03:23,850
+in the terminal.

+72
+00:03:23,850 --> 00:03:25,943
+Check out the video
+linked below to learn how.

+
diff --git a/subtitles/en/71_using-a-debugger-in-a-terminal.srt b/subtitles/en/71_using-a-debugger-in-a-terminal.srt
index 0971982b8..acc0af0be 100644
--- a/subtitles/en/71_using-a-debugger-in-a-terminal.srt
+++ b/subtitles/en/71_using-a-debugger-in-a-terminal.srt
@@ -1,153 +1,350 @@
-1
-00:00:05,840 --> 00:00:11,520
-Using the Python debugger in a terminal. In this  
-video, we'll learn how to use the Python debugger   

-2
-00:00:11,520 --> 00:00:16,800
-in a terminal. For this example, we are running  
-code from the token classification section,   

-3
-00:00:17,600 --> 00:00:22,320
-downloading the Conll dataset before  
-loading a tokenizer to preprocess it.   

-4
-00:00:23,200 --> 00:00:28,720
-Checkout the section of the course linked below  
-for more information. 
Once this is done, we try   - -5 -00:00:28,720 --> 00:00:34,240 -to batch together some features of the training  -dataset by padding them and returning a tensor,   - -6 -00:00:37,200 --> 00:00:40,160 -then we get the following error.   - -7 -00:00:42,800 --> 00:00:47,280 -We use PyTorch here but you will  -get the same error with TensorFlow.   - -8 -00:00:49,280 --> 00:00:53,680 -As we have seen in the "How to debug an  -error?" video, the error message is at the end   - -9 -00:00:53,680 --> 00:00:58,640 -and it indicates we should use padding... which  -we are actually trying to do. So this is not   - -10 -00:00:58,640 --> 00:01:03,360 -useful and we will need to go a little deeper  -to debug the problem. Fortunately, you can use   - -11 -00:01:03,360 --> 00:01:10,400 -the Python debugger quite easily in a terminal by  -launching your script with python -m pdb instead   - -12 -00:01:10,400 --> 00:01:17,200 -of just python. When executing that command, you  -are sent to the first instruction of your script.   - -13 -00:01:17,200 --> 00:01:25,840 -You can run just the next instruction by typing  -n, or continue to the error by directly typing c.   - -14 -00:01:29,680 --> 00:01:33,120 -Once there, you go to the very bottom of  -the traceback, and you can type commands.   - -15 -00:01:34,000 --> 00:01:40,160 -The first two commands you should learn are u and  -d (for up and down), which allow you to go up in   - -16 -00:01:40,160 --> 00:01:48,320 -the Traceback or down. Going up twice, we get to  -the point the error was reached. The third command   - -17 -00:01:48,320 --> 00:01:54,000 -to learn is p, for print. It allows you to print  -any value you want. For instance here, we can see   - -18 -00:01:54,000 --> 00:01:59,120 -the value of return_tensors or batch_outputs  -to try to understand what triggered the error.   - -19 -00:02:00,000 --> 00:02:04,720 -The batch outputs dictionary is a bit hard to  -see, so let's dive into smaller pieces of it.   - -20 -00:02:05,360 --> 00:02:10,560 -Inside the debugger you can not only print  -any variable but also evaluate any expression,   - -21 -00:02:10,560 --> 00:02:23,600 -so we can look independently at the inputs  -or labels. Those labels are definitely weird:   - -22 -00:02:24,160 --> 00:02:27,920 -they are of various size, which we can  -actually confirm by printing the sizes.   - -23 -00:02:35,760 --> 00:02:40,160 -No wonder the tokenizer wasn't able to create  -a tensor with them! This is because the pad   - -24 -00:02:40,160 --> 00:02:45,840 -method only takes care of the tokenizer outputs:  -input IDs, attention mask and token type IDs,   - -25 -00:02:46,400 --> 00:02:50,080 -so we have to pad the labels ourselves  -before trying to create a tensor with them.   - -26 -00:02:51,120 --> 00:02:56,880 -Once you are ready to exit the Python  -debugger, you can press q for quit. Another   - -27 -00:02:56,880 --> 00:03:03,600 -way we can access the Python debugger is to set  -a "set_trace" instruction where we want in the   - -28 -00:03:10,480 --> 00:03:23,280 -script. It will interrupt the execution and  -launch the Python debugger at this place, and we   - -29 -00:03:23,280 --> 00:03:32,080 -can inspect all the variables before the next  -instruction is executed. Typing n executes the   - -30 -00:03:32,080 --> 00:03:37,280 -next instruction, which takes us back inside  -the traceback. 
One way to fix the error is   

-31
-00:03:37,280 --> 00:03:49,760
-to manually pad all labels to the longest, or  
-we can use the data collator designed for this.
+1
+00:00:00,459 --> 00:00:03,542
+(wind swiping sound)

+2
+00:00:05,880 --> 00:00:08,910
+- [Instructor] Using the
+Python debugger in a terminal.

+3
+00:00:08,910 --> 00:00:11,580
+In this video, we'll learn
+how to use a Python debugger

+4
+00:00:11,580 --> 00:00:13,140
+in a terminal.

+5
+00:00:13,140 --> 00:00:15,390
+For this example, we're running code

+6
+00:00:15,390 --> 00:00:17,760
+from the token classification section,

+7
+00:00:17,760 --> 00:00:19,950
+downloading the Conll dataset

+8
+00:00:19,950 --> 00:00:23,340
+before loading a tokenizer
+to pre-process it.

+9
+00:00:23,340 --> 00:00:25,140
+Check out the section
+of the course linked below

+10
+00:00:25,140 --> 00:00:26,223
+for more information.

+11
+00:00:27,600 --> 00:00:28,500
+Once this is done,

+12
+00:00:28,500 --> 00:00:30,630
+we try to batch together some features

+13
+00:00:30,630 --> 00:00:33,180
+of the training dataset by padding them

+14
+00:00:33,180 --> 00:00:34,330
+and returning a tensor.

+15
+00:00:36,810 --> 00:00:39,510
+If we try to execute our
+scripts in a terminal

+16
+00:00:39,510 --> 00:00:40,413
+we get an error.

+17
+00:00:42,630 --> 00:00:44,260
+Note that we use PyTorch here

+18
+00:00:44,260 --> 00:00:45,600
+with return_tensors equal to "pt".

+19
+00:00:45,600 --> 00:00:47,753
+But you would get the same
+error with TensorFlow.

+20
+00:00:49,500 --> 00:00:51,990
+As we have seen in the 'How
+to debug an error?' video,

+21
+00:00:51,990 --> 00:00:54,780
+the error message is at the
+end and it indicates we

+22
+00:00:54,780 --> 00:00:58,260
+should use padding, which
+we're actually trying to do.

+23
+00:00:58,260 --> 00:01:00,630
+So this is not useful and
+we need to go a little deeper

+24
+00:01:00,630 --> 00:01:02,310
+to debug the problem.

+25
+00:01:02,310 --> 00:01:04,830
+Fortunately, you can use the
+Python debugger quite easily

+26
+00:01:04,830 --> 00:01:09,830
+in a terminal by launching
+your script with python -m pdb

+27
+00:01:09,930 --> 00:01:11,980
+and then the name of the training script.

+28
+00:01:13,410 --> 00:01:15,030
+When executing that command, you are sent

+29
+00:01:15,030 --> 00:01:17,340
+to the first instruction of your script.

+30
+00:01:17,340 --> 00:01:20,733
+You can run just the next
+instruction by typing n and enter.

+31
+00:01:22,530 --> 00:01:27,423
+Or you can continue directly
+to the error by typing c and enter.

+32
+00:01:29,850 --> 00:01:31,560
+Once there, you go to the very bottom

+33
+00:01:31,560 --> 00:01:34,050
+of the traceback and
+you can type commands.

+34
+00:01:34,050 --> 00:01:36,360
+The first two commands you
+should learn are u and d,

+35
+00:01:36,360 --> 00:01:38,160
+for up and down.

+36
+00:01:38,160 --> 00:01:41,223
+This allows you to go up
+and down in the traceback.

+37
+00:01:42,990 --> 00:01:46,623
+Going up twice, we get to the
+point the error was reached.

+38
+00:01:47,910 --> 00:01:50,190
+The third command to learn is p, for print.

+39
+00:01:50,190 --> 00:01:52,830
+It allows you to print any value you want.

+40
+00:01:52,830 --> 00:01:56,280
+For instance, here we can see
+the value of return_tensors

+41
+00:01:56,280 --> 00:02:00,210
+or batch_outputs to try to
+understand what triggered the error.
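The same kind of stand-in works for the terminal workflow just described; saved as a script (the file name is made up), it can be launched under the debugger as shown in the comments.

```python
# debug_pad.py: a self-contained stand-in for the failing training script.
# Launch it with:   python -m pdb debug_pad.py
# then at the (Pdb) prompt: c continues to the error, u and d move up and down
# the traceback, p features or p batch_outputs prints values, q quits.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")  # placeholder checkpoint

features = [
    {**tokenizer("A short sentence."), "labels": [0, 1, 0]},
    {**tokenizer("A noticeably longer example sentence."), "labels": [0, 1, 2, 0, 1]},
]

# Fails here: the ragged "labels" cannot be converted to a tensor,
# because tokenizer.pad does not pad them.
batch = tokenizer.pad(features, return_tensors="pt")
```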
+
+42
+00:02:00,210 --> 00:02:03,000
+The batch outputs dictionary
+is a bit hard to see

+43
+00:02:03,000 --> 00:02:05,520
+so let's dive into smaller pieces of it.

+44
+00:02:05,520 --> 00:02:08,460
+Inside the debugger, you can
+not only print any variable

+45
+00:02:08,460 --> 00:02:10,740
+but also evaluate any expression,

+46
+00:02:10,740 --> 00:02:13,713
+so we can look
+independently at the inputs.

+47
+00:02:15,060 --> 00:02:15,993
+Also the labels.

+48
+00:02:22,350 --> 00:02:24,300
+Those labels are definitely weird.

+49
+00:02:24,300 --> 00:02:26,880
+They are of various sizes,
+which we can confirm

+50
+00:02:26,880 --> 00:02:29,553
+by printing the sizes using
+a list comprehension.

+51
+00:02:35,880 --> 00:02:37,800
+No wonder the tokenizer
+wasn't able to create

+52
+00:02:37,800 --> 00:02:39,270
+a tensor with them.

+53
+00:02:39,270 --> 00:02:41,460
+This is because the pad
+method only takes care

+54
+00:02:41,460 --> 00:02:44,850
+of the tokenizer outputs, the
+input IDs, the attention mask

+55
+00:02:44,850 --> 00:02:46,560
+and the token type IDs.

+56
+00:02:46,560 --> 00:02:48,390
+So we have to pad the labels ourselves

+57
+00:02:48,390 --> 00:02:51,300
+before trying to create
+a new tensor with them.

+58
+00:02:51,300 --> 00:02:54,030
+Once you're ready to
+exit the Python debugger,

+59
+00:02:54,030 --> 00:02:56,640
+you can press q for quit and enter.

+60
+00:02:56,640 --> 00:02:59,790
+Another way we can access
+the Python debugger,

+61
+00:02:59,790 --> 00:03:02,310
+is to put a breakpoint in our script.

+62
+00:03:02,310 --> 00:03:05,913
+We can do this using the
+pdb.set_trace method.

+63
+00:03:07,920 --> 00:03:09,870
+As long as we import the pdb module

+64
+00:03:09,870 --> 00:03:11,420
+at the beginning of our script.

+65
+00:03:12,510 --> 00:03:17,283
+Saving and then relaunching
+our script, with just Python,

+66
+00:03:19,710 --> 00:03:23,310
+will stop the execution at
+the breakpoint we set.

+67
+00:03:23,310 --> 00:03:24,660
+We can inspect all the variables

+68
+00:03:24,660 --> 00:03:27,030
+before the next instruction
+is executed again.

+69
+00:03:27,030 --> 00:03:29,253
+For instance, here, the features.

+70
+00:03:30,270 --> 00:03:33,090
+Typing n and enter executes
+the next instruction

+71
+00:03:33,090 --> 00:03:35,700
+which takes us back inside the traceback.

+72
+00:03:35,700 --> 00:03:37,530
+One way to fix the error manually is to

+73
+00:03:37,530 --> 00:03:39,873
+pad all the labels to the longest.

+74
+00:03:42,000 --> 00:03:45,120
+Another way is to use
+the data collator suitable

+75
+00:03:45,120 --> 00:03:46,443
+for token classification.

+76
+00:03:48,330 --> 00:03:50,340
+If you want to learn how to use the Python

+77
+00:03:50,340 --> 00:03:53,273
+debugger in a notebook, check
+out the video linked below.

+78
+00:03:54,698 --> 00:03:57,781
+(wind swiping sound)

+
diff --git a/subtitles/en/72_asking-for-help-on-the-forums.srt b/subtitles/en/72_asking-for-help-on-the-forums.srt
index 40ba6dbc9..e34751109 100644
--- a/subtitles/en/72_asking-for-help-on-the-forums.srt
+++ b/subtitles/en/72_asking-for-help-on-the-forums.srt
@@ -1,174 +1,346 @@
-1
-00:00:05,520 --> 00:00:08,080
-How to ask a question on the Hugging Face forums?

-2
-00:00:09,840 --> 00:00:15,360
-If you have a general question or are looking to  
-debug your code, the forums are the place to ask. 
- -3 -00:00:15,360 --> 00:00:17,760 -In this video we will teach you  -how to write a good question,   - -4 -00:00:17,760 --> 00:00:20,080 -to maximize the chances you will get an answer. - -5 -00:00:21,360 --> 00:00:25,120 -First things first, to login on the  -forums, you need a Hugging Face account.   - -6 -00:00:25,680 --> 00:00:32,560 -If you haven't created one yet, go to hf.co and  -click Sign Up. There is also a direct link below. - -7 -00:00:33,520 --> 00:00:34,880 -Fill your email and password,   - -8 -00:00:34,880 --> 00:00:38,480 -then continue the steps ot pick a  -username and update a profile picture. - -9 -00:00:39,440 --> 00:00:43,040 -Once this is done, go to discuss.huggingface.co   - -10 -00:00:43,040 --> 00:00:48,480 -(linked below) and click Log In. Use the same  -login information as for the Hugging Face website. - -11 -00:00:49,600 --> 00:00:53,840 -You can search the forums by clicking on the  -magnifying glass. Someone may have already asked   - -12 -00:00:53,840 --> 00:00:59,040 -your question in a topic! If you find you can't  -post a new topic as a new user, it may be because   - -13 -00:00:59,040 --> 00:01:04,480 -of the antispam filters. Make sure you spend some  -time reading existing topics to deactivate it. - -14 -00:01:05,120 --> 00:01:09,440 -When you are sure your question hasn't been  -asked yet, click on the New Topic button. - -15 -00:01:09,440 --> 00:01:11,840 -For this example, we will use the following code, - -16 -00:01:12,400 --> 00:01:16,320 -that produces an error, as we saw in the  -"What to do when I get an error?" video. - -17 -00:01:18,080 --> 00:01:22,400 -The first step is to pick a category for our  -new topic. Since our error has to do with the   - -18 -00:01:22,400 --> 00:01:29,040 -Transformers library, we pick this category. New,  -choose a title that summarizes your error well.   - -19 -00:01:29,680 --> 00:01:33,040 -Don't be too vague or users that get  -the same error you did in the future   - -20 -00:01:33,040 --> 00:01:37,600 -won't be able to find your topic. Once  -you have finished typing your topic,   - -21 -00:01:38,160 --> 00:01:41,680 -make sure the question hasn't been answered  -in the topics Discourse suggests you.   - -22 -00:01:42,480 --> 00:01:44,960 -Click on the cross to remove that  -window when you have double-checked.   - -23 -00:01:46,000 --> 00:01:50,880 -This is an example of what not to do when  -posting an error: the message is very vague   - -24 -00:01:50,880 --> 00:01:55,120 -so no one else will be able to guess what went  -wrong for you, and it tags too many people.   - -25 -00:01:56,320 --> 00:02:00,560 -Tagging people (especially moderators) might  -have the opposite effect of what you want.   - -26 -00:02:01,200 --> 00:02:05,520 -As you send them a notification (and they get  -plenty), they will probably not bother replying   - -27 -00:02:05,520 --> 00:02:10,480 -to you, and users you didn't tag will probably  -ignore the question since they see tagged users.   - -28 -00:02:11,200 --> 00:02:15,520 -Only tag a user when you are completely certain  -they are the best placed to answer your question.   - -29 -00:02:17,520 --> 00:02:21,760 -Be precise in your text, and if you have an  -error coming from a specific piece of code,   - -30 -00:02:21,760 --> 00:02:27,760 -include that code in your post. To make sure your  -post looks good, place your question between three   - -31 -00:02:27,760 --> 00:02:32,720 -backticks like this. You can check on the  -right how your post will appear once posted.   
- -32 -00:02:34,080 --> 00:02:39,040 -If your question is about an error, it's even  -better to include the full traceback. As explained   - -33 -00:02:39,040 --> 00:02:44,560 -in the "what to do when I get an error?' video,  -expand the traceback if you are on Colab. like   - -34 -00:02:44,560 --> 00:02:50,320 -for the code, put it between two lines containing  -three backticks for proper formatting. Our last   - -35 -00:02:50,320 --> 00:02:55,120 -advice is to remember to be nice, a please and a  -thank you will go a long way into getting others   - -36 -00:02:55,120 --> 00:03:03,840 -to help you. With all that done properly, your  -question should get an answer pretty quickly! +1 +00:00:00,125 --> 00:00:01,455 +(title whooshes) + +2 +00:00:01,455 --> 00:00:02,789 +(logo pops) + +3 +00:00:02,789 --> 00:00:05,700 +(title whooshes) + +4 +00:00:05,700 --> 00:00:08,433 +- How to ask a question on +the Hugging Face forums? + +5 +00:00:10,020 --> 00:00:11,640 +If you have a general question + +6 +00:00:11,640 --> 00:00:13,110 +or are looking to debug your code, + +7 +00:00:13,110 --> 00:00:15,540 +the forums are the place to ask. + +8 +00:00:15,540 --> 00:00:16,710 +In this video we will teach you + +9 +00:00:16,710 --> 00:00:18,030 +how to write a good question, + +10 +00:00:18,030 --> 00:00:20,380 +to maximize the chances +you will get an answer. + +11 +00:00:21,570 --> 00:00:23,970 +First things first, to +login on the forums, + +12 +00:00:23,970 --> 00:00:25,920 +you need a Hugging Face account. + +13 +00:00:25,920 --> 00:00:27,750 +If you haven't created one already, + +14 +00:00:27,750 --> 00:00:31,080 +go to hf.co and click sign up. + +15 +00:00:31,080 --> 00:00:32,780 +There is also a direct link below. + +16 +00:00:33,750 --> 00:00:35,160 +Fill your email and password, + +17 +00:00:35,160 --> 00:00:37,410 +then continue the steps +to pick your username + +18 +00:00:37,410 --> 00:00:38,860 +and update a profile picture. + +19 +00:00:39,720 --> 00:00:43,200 +Once this is done, go to +discuss.huggingface.co, + +20 +00:00:43,200 --> 00:00:45,630 +link below, and click log in. + +21 +00:00:45,630 --> 00:00:47,033 +Use the same login information as + +22 +00:00:47,033 --> 00:00:48,693 +for the Hugging Face website. + +23 +00:00:49,890 --> 00:00:51,300 +You can search the forums by clicking + +24 +00:00:51,300 --> 00:00:52,800 +on the magnifying glass. + +25 +00:00:52,800 --> 00:00:55,710 +Someone may have already asked +your question in a topic. + +26 +00:00:55,710 --> 00:00:58,260 +If you find you can't post +a new topic as a new user, + +27 +00:00:58,260 --> 00:01:01,290 +it may be because of the antispam filters. + +28 +00:01:01,290 --> 00:01:03,750 +Make sure you spend some +time reading existing topics + +29 +00:01:03,750 --> 00:01:05,370 +to deactivate it. + +30 +00:01:05,370 --> 00:01:07,590 +When you're sure your question +hasn't been asked yet, + +31 +00:01:07,590 --> 00:01:09,660 +click on the new topic button. + +32 +00:01:09,660 --> 00:01:12,600 +For this example, we'll +use the following code, + +33 +00:01:12,600 --> 00:01:13,860 +that produces an error, + +34 +00:01:13,860 --> 00:01:16,660 +as we saw in the "What to do +when I get an error" video. + +35 +00:01:18,330 --> 00:01:21,330 +The first step is to pick a +category for our new topic. + +36 +00:01:21,330 --> 00:01:23,790 +Since our error has to do +with the Transformers library, + +37 +00:01:23,790 --> 00:01:24,903 +we pick this category. 
+ +38 +00:01:26,070 --> 00:01:29,880 +Next, choose a title that +summarizes your error well. + +39 +00:01:29,880 --> 00:01:32,300 +Don't be too vague or users +that get the same error you did + +40 +00:01:32,300 --> 00:01:34,773 +in the future won't be +able to find your topic. + +41 +00:01:36,150 --> 00:01:38,370 +Once you have finished +typing your topic title, + +42 +00:01:38,370 --> 00:01:40,170 +make sure the question +hasn't been answered + +43 +00:01:40,170 --> 00:01:42,690 +in the topics Discourse suggests you. + +44 +00:01:42,690 --> 00:01:44,190 +Click on the cross to remove that window + +45 +00:01:44,190 --> 00:01:46,230 +when you have double-checked. + +46 +00:01:46,230 --> 00:01:49,710 +This is an example of what not +to do when posting an error. + +47 +00:01:49,710 --> 00:01:51,120 +The message is very vague, + +48 +00:01:51,120 --> 00:01:53,370 +so no one else will be able +to guess what went wrong + +49 +00:01:53,370 --> 00:01:55,623 +for you, and it tags too many people. + +50 +00:01:56,490 --> 00:01:58,740 +Tagging people, especially moderators, + +51 +00:01:58,740 --> 00:02:01,470 +might have the opposite +effect of what you want. + +52 +00:02:01,470 --> 00:02:04,380 +As you send them a notification, +and they get plenty, + +53 +00:02:04,380 --> 00:02:06,300 +they will probably not +bother replying to you, + +54 +00:02:06,300 --> 00:02:09,300 +and users you didn't tag will +probably ignore the questions, + +55 +00:02:09,300 --> 00:02:11,430 +since they see tagged users. + +56 +00:02:11,430 --> 00:02:13,697 +Only tag a user when you +are completely certain + +57 +00:02:13,697 --> 00:02:16,097 +they are the best place +to answer your question. + +58 +00:02:17,730 --> 00:02:20,370 +Be precise in your text, and +if you have an error coming + +59 +00:02:20,370 --> 00:02:22,710 +from a specific piece of +code, include that code + +60 +00:02:22,710 --> 00:02:24,030 +in your post. + +61 +00:02:24,030 --> 00:02:27,210 +To make sure your post looks +good, place your question + +62 +00:02:27,210 --> 00:02:30,060 +between three backticks like this. + +63 +00:02:30,060 --> 00:02:30,990 +You can check on the right + +64 +00:02:30,990 --> 00:02:32,943 +how your post will appear once posted. + +65 +00:02:34,320 --> 00:02:35,850 +If your question is about an error, + +66 +00:02:35,850 --> 00:02:38,640 +it's even better to +include the full traceback. + +67 +00:02:38,640 --> 00:02:41,610 +As explained in the "What to +do when I get an error" video, + +68 +00:02:41,610 --> 00:02:43,763 +expand the traceback if you're on Colab. + +69 +00:02:44,769 --> 00:02:45,990 +Like for the code, put it + +70 +00:02:45,990 --> 00:02:48,300 +between two lines +containing three backticks + +71 +00:02:48,300 --> 00:02:50,160 +for proper formatting. + +72 +00:02:50,160 --> 00:02:52,740 +Our last advice is to remember to be nice. + +73 +00:02:52,740 --> 00:02:54,540 +A "Please," and a "Thank +you" will go a long way + +74 +00:02:54,540 --> 00:02:56,490 +into getting others to help you. + +75 +00:02:56,490 --> 00:02:57,780 +With all that done properly, + +76 +00:02:57,780 --> 00:03:00,143 +your question should get +an answer pretty quickly. 
+ +77 +00:03:01,293 --> 00:03:04,344 +(title whooshes) + +78 +00:03:04,344 --> 00:03:06,034 +(title fizzles) + diff --git a/subtitles/en/73_debugging-the-training-pipeline-(pytorch).srt b/subtitles/en/73_debugging-the-training-pipeline-(pytorch).srt index f73f5ef0c..6463b1fc6 100644 --- a/subtitles/en/73_debugging-the-training-pipeline-(pytorch).srt +++ b/subtitles/en/73_debugging-the-training-pipeline-(pytorch).srt @@ -1,214 +1,448 @@ -1 -00:00:06,080 --> 00:00:10,960 -In this video, we will see how to debug an error  -you encounter when running trainer.train().   - -2 -00:00:12,240 --> 00:00:17,280 -As an example, we will use this script that  -finetunes a bert model on the GLUE MNLI dataset.   - -3 -00:00:17,840 --> 00:00:20,880 -Checkout the videos linked below to  -see how we came to such a script,   - -4 -00:00:21,680 --> 00:00:26,960 -here we want to learn how to debug the problems  -in it. Running the script gives us an error pretty   - -5 -00:00:26,960 --> 00:00:31,920 -fast. It happens at the line where we feed the  -inputs to the model, according to the traceback.   - -6 -00:00:32,640 --> 00:00:36,720 -That tells us there is a problem there, but the  -problem could come from many different causes.   - -7 -00:00:37,520 --> 00:00:41,600 -To debug an error in a training, you need to  -make sure each step of the training pipeline   - -8 -00:00:41,600 --> 00:00:46,160 -works as intended. This means checking that  -the inputs of your dataset are correct,   - -9 -00:00:46,800 --> 00:00:50,560 -you can batch them together, feed  -them through the model to get a loss,   - -10 -00:00:50,560 --> 00:00:54,080 -then compute the gradients of that loss  -before performing an optimizer step.   - -11 -00:00:55,280 --> 00:01:00,480 -So let's start by looking at the training dataset  -this Trainer is using. There is definitely a   - -12 -00:01:00,480 --> 00:01:07,040 -problem there as we see texts and not numbers. The  -error message was telling us the model did not get   - -13 -00:01:07,040 --> 00:01:13,120 -input IDs and we do not have those in the dataset  -indeed. Looking back at our code, we can see we   - -14 -00:01:13,120 --> 00:01:18,800 -made a mistake and passed the wrong datasets to  -the Trainer. So let's fix that and run again.   - -15 -00:01:20,240 --> 00:01:25,680 -Now we have a new error. Inspecting the traceback  -tells us it happens when we try to create a batch,   - -16 -00:01:25,680 --> 00:01:31,920 -specifically to group the features in a tensor.  -We can confirm this by asking the Trainer to get   - -17 -00:01:31,920 --> 00:01:37,600 -us a batch of the training data loader, which  -reproduces the same error. Either by inspecting   - -18 -00:01:37,600 --> 00:01:43,600 -the inputs or debugging, we can then see they are  -not all of the same size. This is because we have   - -19 -00:01:43,600 --> 00:01:48,240 -not passed a data collator to do the padding in  -the Trainer and didn't pad when preprocessing   - -20 -00:01:48,240 --> 00:01:53,440 -the data either. Padding inside the Trainer is  -normally the default, but only if you provide   - -21 -00:01:53,440 --> 00:01:58,800 -your tokenizer to the Trainer, and we forgot to  -do that. So let's fix the issue and run again.   - -22 -00:02:00,320 --> 00:02:06,400 -This time we get a nasty CUDA error. 
They  -are very difficult to debug because for one,   - -23 -00:02:07,120 --> 00:02:11,280 -they put your kernel in a state that is not  -recoverable (so you have to restart your   - -24 -00:02:11,280 --> 00:02:15,840 -notebook from the beginning) and two, the  -traceback is completely useless for those.   - -25 -00:02:16,800 --> 00:02:22,240 -Here the traceback tells us the error happens when  -we do the gradient computation with loss.backward,   - -26 -00:02:22,240 --> 00:02:27,840 -but as we will see later on that is not the  -case. This is because everything that happens   - -27 -00:02:27,840 --> 00:02:33,520 -on the GPU is done asynchronously: when you  -execute the model call, what the program does   - -28 -00:02:33,520 --> 00:02:39,280 -is just stacking that in the queue of GPU, then  -(if the GPU didn't have any current job to do),   - -29 -00:02:39,280 --> 00:02:43,920 -the work will start on the GPU at the same time  -as the CPU will move to the next instruction.   - -30 -00:02:44,800 --> 00:02:50,000 -Continuing with the extraction of the loss, this  -is stacked into the GPU queue while the CPU moves   - -31 -00:02:50,000 --> 00:02:54,960 -to the instruction loss.backward. But the GPU  -still hasn't finished the forward pass of the   - -32 -00:02:54,960 --> 00:03:01,760 -model since all that took no time at all. The CPU  -stops moving forward, because loss.backward as an   - -33 -00:03:01,760 --> 00:03:09,360 -instruction telling it to wait for the GPUs to be  -finished, and when the GPU encounters an error,   - -34 -00:03:09,360 --> 00:03:15,040 -it gives with a cryptic message back to the  -CPU, who raises the error at the wrong place.   - -35 -00:03:16,080 --> 00:03:20,320 -So to debug this, we will need to execute the  -next steps of the training pipeline on the CPU.   - -36 -00:03:20,960 --> 00:03:26,320 -It is very easy to do, and we get a traceback  -we can trust this time. As we said before,   - -37 -00:03:26,320 --> 00:03:32,720 -the error happens during the forward pass  -of the model, and it's an index error.   - -38 -00:03:33,360 --> 00:03:38,800 -With a bit of debugging, we see we have labels  -ranging from 0 to 2, so three different values,   - -39 -00:03:38,800 --> 00:03:44,240 -but our outputs have a shape of batch size per 2.  -It looks like our model has the wrong number of   - -40 -00:03:44,240 --> 00:03:50,320 -labels! We can indeed confirm that, and now that  -we know it's easy to fix it in the code by adding   - -41 -00:03:50,320 --> 00:03:58,720 -num_labels=3 when we create the model. Now the  -training script will run to completion! We did not   - -42 -00:03:58,720 --> 00:04:02,640 -need it yet, but here is how we would debug the  -next step of the pipeline, gradient computation,   - -43 -00:04:03,360 --> 00:04:13,840 -as well as the optimizer step. With all of  -this, good luck debugging your own trainings! +1 +00:00:06,210 --> 00:00:08,760 +- In this video, we will +see how to debug an error + +2 +00:00:08,760 --> 00:00:11,896 +you encounter when running Trainer.train + +3 +00:00:11,896 --> 00:00:15,066 +As an example, we will use +this script that finetunes + +4 +00:00:15,066 --> 00:00:17,760 +a bert model on the GLUE MNLI dataset. + +5 +00:00:17,760 --> 00:00:19,470 +Checkout the videos linked below + +6 +00:00:19,470 --> 00:00:21,840 +to see how we came to such a script. + +7 +00:00:21,840 --> 00:00:24,540 +Here we want to learn how +to debug the problems in it. 
+ +8 +00:00:25,470 --> 00:00:28,110 +Running the script gives +us an error pretty quickly. + +9 +00:00:28,110 --> 00:00:29,040 +It happens at the line + +10 +00:00:29,040 --> 00:00:30,990 +where we feed the inputs to the model, + +11 +00:00:30,990 --> 00:00:32,850 +according to the traceback. + +12 +00:00:32,850 --> 00:00:34,702 +That tells us there is a problem there, + +13 +00:00:34,702 --> 00:00:37,881 +but the problem could come +from many different causes. + +14 +00:00:37,881 --> 00:00:39,330 +To debug an error in a training, + +15 +00:00:39,330 --> 00:00:41,760 +you need to make sure each +step of the training pipeline + +16 +00:00:41,760 --> 00:00:43,440 +works as intended. + +17 +00:00:43,440 --> 00:00:45,780 +This means checking that +the inputs of your dataset + +18 +00:00:45,780 --> 00:00:47,040 +are correct, + +19 +00:00:47,040 --> 00:00:48,720 +you can batch them together, + +20 +00:00:48,720 --> 00:00:50,790 +feed them through the model to get a loss, + +21 +00:00:50,790 --> 00:00:52,500 +then compute the gradients of that loss + +22 +00:00:52,500 --> 00:00:54,303 +before performing an optimizer step. + +23 +00:00:55,470 --> 00:00:57,810 +So let's start by looking +at the training dataset + +24 +00:00:57,810 --> 00:00:59,043 +this Trainer is using. + +25 +00:00:59,910 --> 00:01:02,190 +There is definitely a problem here. + +26 +00:01:02,190 --> 00:01:04,293 +We see texts and not number. + +27 +00:01:05,130 --> 00:01:06,660 +The error message was telling us the model + +28 +00:01:06,660 --> 00:01:08,220 +did not get input IDs + +29 +00:01:08,220 --> 00:01:11,100 +and we do not have those +in the dataset indeed. + +30 +00:01:11,100 --> 00:01:12,660 +Looking back at our code, + +31 +00:01:12,660 --> 00:01:14,400 +we can see we made a mistake + +32 +00:01:14,400 --> 00:01:17,400 +and passed the wrong +datasets to the Trainer. + +33 +00:01:17,400 --> 00:01:19,173 +So let's fix that and run again. + +34 +00:01:20,490 --> 00:01:21,840 +Now we have a new error. + +35 +00:01:21,840 --> 00:01:23,130 +Inspecting the traceback + +36 +00:01:23,130 --> 00:01:25,860 +tells us it happens when +we try to create a batch, + +37 +00:01:25,860 --> 00:01:28,743 +specifically to group +the features in a tensor. + +38 +00:01:29,700 --> 00:01:32,610 +We can confirm this by asking +the Trainer to get us a batch + +39 +00:01:32,610 --> 00:01:34,230 +of the training data loader, + +40 +00:01:34,230 --> 00:01:35,913 +which reproduces the same error. + +41 +00:01:36,780 --> 00:01:39,064 +Either by inspecting +the inputs or debugging, + +42 +00:01:39,064 --> 00:01:42,870 +we can then see they are +not all of the same size. + +43 +00:01:42,870 --> 00:01:45,120 +This is because we have +not passed a data collator + +44 +00:01:45,120 --> 00:01:46,890 +to do the padding to the Trainer + +45 +00:01:46,890 --> 00:01:49,443 +and didn't pad when +preprocessing the data either. + +46 +00:01:50,430 --> 00:01:52,710 +Padding inside the Trainer +is normally the default, + +47 +00:01:52,710 --> 00:01:55,380 +but only if you provide your +tokenizer to the Trainer, + +48 +00:01:55,380 --> 00:01:57,270 +and we forgot to do that. + +49 +00:01:57,270 --> 00:01:59,120 +So let's fix the issue and run again. + +50 +00:02:00,510 --> 00:02:02,883 +This time we get a nasty CUDA error. 
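Here is a hedged sketch of the checks made so far; model, args, tokenizer and the tokenized datasets are assumed to come from earlier in this video's training script, and the split names are only a guess for MNLI.

```python
from transformers import Trainer

# 1. Look at what the Trainer is actually training on: this should contain
#    input_ids and labels, not raw text columns.
print(trainer.train_dataset[0])

# 2. Reproduce the collation problem outside of training by grabbing
#    a single batch from the training dataloader.
for batch in trainer.get_train_dataloader():
    break

# 3. Recreate the Trainer with the tokenized datasets and the tokenizer;
#    when a tokenizer is provided, the Trainer defaults to DataCollatorWithPadding.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation_matched"],  # assumed MNLI split name
    tokenizer=tokenizer,
)
```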
+ +51 +00:02:03,765 --> 00:02:06,285 +They are very difficult +to debug because for one, + +52 +00:02:06,285 --> 00:02:10,530 +they put your kernel in a +state that is not recoverable + +53 +00:02:10,530 --> 00:02:13,260 +so you have to restart your +notebook from the beginning + +54 +00:02:13,260 --> 00:02:16,950 +and two, the traceback is +completely useless for those. + +55 +00:02:16,950 --> 00:02:19,230 +Here the traceback tells +us the error happens + +56 +00:02:19,230 --> 00:02:22,500 +when we do the gradient +computation with loss.backward, + +57 +00:02:22,500 --> 00:02:25,113 +but as we will see later +on that is not the case. + +58 +00:02:26,520 --> 00:02:28,920 +This is because everything +that happens on the GPU + +59 +00:02:28,920 --> 00:02:30,720 +is done asynchronously. + +60 +00:02:30,720 --> 00:02:32,880 +When you execute the model call, + +61 +00:02:32,880 --> 00:02:34,457 +what the program does +is just stacking that + +62 +00:02:34,457 --> 00:02:36,600 +in the queue of GPU, + +63 +00:02:36,600 --> 00:02:39,856 +then if the GPU didn't +have any current job to do, + +64 +00:02:39,856 --> 00:02:41,850 +the work will start on +the GPU at the same time + +65 +00:02:41,850 --> 00:02:45,000 +as the CPU moves to the next instruction. + +66 +00:02:45,000 --> 00:02:47,040 +Continuing with the +extraction of the loss, + +67 +00:02:47,040 --> 00:02:49,170 +this is stacked into the GPU queue + +68 +00:02:49,170 --> 00:02:51,953 +while the CPU moves to the +instruction loss.backward. + +69 +00:02:51,953 --> 00:02:54,180 +But the GPU still hasn't finished + +70 +00:02:54,180 --> 00:02:55,710 +the forward pass of the model + +71 +00:02:55,710 --> 00:02:57,603 +since all that took no time at all. + +72 +00:02:58,440 --> 00:03:00,210 +The CPU stops moving forward, + +73 +00:03:00,210 --> 00:03:03,240 +because loss.backward as an +instruction telling it to wait + +74 +00:03:03,240 --> 00:03:04,830 +for the GPUs to be finished, + +75 +00:03:04,830 --> 00:03:06,780 +to make sure the gradients are correct. + +76 +00:03:07,650 --> 00:03:09,570 +When the GPU encounters an error, + +77 +00:03:09,570 --> 00:03:13,140 +it gives it back to the +CPU with a cryptic message + +78 +00:03:13,140 --> 00:03:15,423 +who raises the error at the wrong place. + +79 +00:03:16,350 --> 00:03:18,720 +So to debug this, we will +need to execute the next steps + +80 +00:03:18,720 --> 00:03:21,211 +of the training pipeline on the CPU. + +81 +00:03:21,211 --> 00:03:22,380 +It is very easy to do, + +82 +00:03:22,380 --> 00:03:25,350 +and we get a traceback +we can trust this time. + +83 +00:03:25,350 --> 00:03:26,520 +As we said before, + +84 +00:03:26,520 --> 00:03:28,620 +the error actually happens +during the forward pass + +85 +00:03:28,620 --> 00:03:29,453 +of the model, + +86 +00:03:29,453 --> 00:03:30,993 +and not loss.backward. + +87 +00:03:31,920 --> 00:03:33,680 +It's an index error. + +88 +00:03:33,680 --> 00:03:34,950 +With a bit of debugging, + +89 +00:03:34,950 --> 00:03:37,410 +we see we have labels ranging from 0 to 2, + +90 +00:03:37,410 --> 00:03:39,000 +so three different values, + +91 +00:03:39,000 --> 00:03:42,191 +but our outputs have a +shape of batch size per 2. + +92 +00:03:42,191 --> 00:03:45,600 +It looks like our model has +the wrong number of labels. 
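A sketch of the CPU round-trip described above, assuming the trainer and batch from the previous step: moving everything to the CPU makes the failure synchronous, so the traceback points at the real culprit.

```python
import torch

# Run one forward pass on CPU to get a readable traceback
# instead of an asynchronous CUDA error.
model_cpu = trainer.model.to("cpu")
batch_cpu = {k: v.to("cpu") for k, v in batch.items()}

print(torch.unique(batch_cpu["labels"]))  # tensor([0, 1, 2]): three classes

# With the model still configured for two labels, this raises an IndexError
# inside the loss computation, which is the error we were chasing.
outputs = model_cpu(**batch_cpu)
```

The fix, shown next in the video, is to recreate the model with num_labels=3.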
+ +93 +00:03:45,600 --> 00:03:47,190 +We can indeed confirm that, + +94 +00:03:47,190 --> 00:03:49,860 +and now that we know it's +easy to fix it in the code + +95 +00:03:49,860 --> 00:03:53,969 +by adding num_labels=3 +when we create the model. + +96 +00:03:53,969 --> 00:03:56,883 +Now the training script +will run to completion. + +97 +00:03:58,440 --> 00:03:59,430 +We did not need it yet, + +98 +00:03:59,430 --> 00:04:00,960 +but here is how we would +debug the next step + +99 +00:04:00,960 --> 00:04:02,944 +of the pipeline, gradient computation, + +100 +00:04:02,944 --> 00:04:05,850 +as well as the optimizer step. + +101 +00:04:05,850 --> 00:04:08,823 +With all of this, good luck +debugging your own trainings! + diff --git a/subtitles/en/74_debugging-the-training-pipeline-(tensorflow).srt b/subtitles/en/74_debugging-the-training-pipeline-(tensorflow).srt index c4cbfe579..aaac0136f 100644 --- a/subtitles/en/74_debugging-the-training-pipeline-(tensorflow).srt +++ b/subtitles/en/74_debugging-the-training-pipeline-(tensorflow).srt @@ -1,309 +1,799 @@ -1 -00:00:04,720 --> 00:00:09,280 -Some bugs in your code are very  -straightforward. You try running it,   - -2 -00:00:09,280 --> 00:00:14,720 -you get a syntax error somewhere, Python tells  -you exactly where, and you fix it. This is   - -3 -00:00:14,720 --> 00:00:22,240 -great - it's simple and satisfying. Sometimes,  -though, things crash and the error is impossible   - -4 -00:00:22,240 --> 00:00:27,360 -to understand. This happens a lot in machine  -learning for a few reasons - you're working with   - -5 -00:00:27,360 --> 00:00:33,920 -big data structures, using big, complex libraries  -with a lot of moving parts, and also you're doing   - -6 -00:00:33,920 --> 00:00:41,600 -a lot of GPU computing. In Keras there's the added  -bonus problem that your models are often compiled   - -7 -00:00:41,600 --> 00:00:46,080 -before execution, which is great for performance  -but makes debugging them very difficult.   - -8 -00:00:47,920 --> 00:00:52,160 -This is going to be a video about what to do  -when you run into one of those nightmare bugs.   - -9 -00:00:56,400 --> 00:01:07,600 -To give you some intuitions for what can go wrong,  -and where to look for the source of bugs that you   - -10 -00:01:07,600 --> 00:01:13,360 -encounter, let's use this example script, and  -I'll show it to you here in two parts. First,   - -11 -00:01:13,360 --> 00:01:19,280 -we do all our imports, we load a dataset, we  -create our tokenizer and we tokenize the dataset.   - -12 -00:01:20,160 --> 00:01:28,320 -Next, we convert our datasets to TensorFlow  -datasets, so that we can run fit() on them,   - -13 -00:01:28,320 --> 00:01:34,640 -and then we load our model from a pretrained  -checkpoint, compile it and fit it. It seems   - -14 -00:01:34,640 --> 00:01:42,880 -straightforward enough, but beware! This spooky  -code hides many dark and mysterious secrets.   - -15 -00:01:43,760 --> 00:01:52,880 -What happens when we run it? Well, this isn't  -great. What does that mean? We tried to train   - -16 -00:01:52,880 --> 00:01:59,600 -on our data, but we got no gradient? This is  -pretty perplexing - how do we even begin to debug   - -17 -00:02:00,400 --> 00:02:04,880 -something like that? When the error you get  -doesn't immediately suggest where the problem is,   - -18 -00:02:05,440 --> 00:02:11,040 -the best solution is often to walk through  -things in sequence, making sure at each stage   - -19 -00:02:11,040 --> 00:02:19,120 -that things look right. 
And of course, the  -place to start is always to check your data.   - -20 -00:02:20,720 --> 00:02:28,960 -The best way to do that to grab a batch from the  -tf.data.Dataset that your model is training on,   - -21 -00:02:30,560 --> 00:02:41,840 -right at the end of the training pipeline. And  -we can do that like so, by looping over the   - -22 -00:02:41,840 --> 00:02:50,320 -dataset for one iteration and then breaking.  -So what do we get when we inspect that batch?   - -23 -00:02:50,320 --> 00:02:54,800 -We see that we're not getting any gradient  -because we're not passing labels to Keras!   - -24 -00:02:55,520 --> 00:03:00,800 -Our labels are in the batch, but they're a key  -in the input dictionary, not a separate label.   - -25 -00:03:02,400 --> 00:03:06,160 -This is one of the most common issues you'll  -encounter when training Transformers models   - -26 -00:03:06,160 --> 00:03:12,960 -with TensorFlow. Our models can all compute loss  -internally, but to use that loss for training   - -27 -00:03:12,960 --> 00:03:16,880 -the labels need to be passed in the input  -dictionary, where the model can see them.   - -28 -00:03:17,760 --> 00:03:23,200 -This internal loss is the loss that we use when  -we don't specify a loss value to compile().   - -29 -00:03:26,640 --> 00:03:30,960 -Keras, on the other hand, usually expects  -labels to be passed separately from the input   - -30 -00:03:30,960 --> 00:03:36,720 -dictionary and not to be visible to the model,  -and loss computations will usually fail if you   - -31 -00:03:36,720 --> 00:03:43,040 -don't do that. We need to choose one or the other:  -Either we use the model's internal loss and keep   - -32 -00:03:43,040 --> 00:03:49,120 -the labels where they are, or we keep using Keras  -losses, but we move the labels to the place Keras   - -33 -00:03:49,120 --> 00:03:57,680 -expects them. For simplicity, let's use the model  -internal losses, by removing the loss argument   - -34 -00:03:57,680 --> 00:04:06,000 -from the call to compile(). So what happens if  -we try training with model.fit() after fixing   - -35 -00:04:06,560 --> 00:04:13,840 -the loss function! Well, it runs this time...  -but now we get a loss of nan. This isn't good.   - -36 -00:04:16,240 --> 00:04:22,160 -NaN is not a good loss. In fact, if we inspect  -our model now, we'll see that not only are all   - -37 -00:04:22,160 --> 00:04:30,640 -the outputs nan , all the weights are nan too.  -Once a single nan creeps into your computations,   - -38 -00:04:30,640 --> 00:04:37,280 -it tends to spread, because it propagates  -from the loss back through your gradient   - -39 -00:04:37,280 --> 00:04:48,320 -and then into the weight updates. So nan destroyed  -our model. But where did it creep in first?   - -40 -00:04:49,600 --> 00:04:57,760 -To find out, we need to re-initialize the model  -and look at the outputs for just the first batch.   - -41 -00:04:58,400 --> 00:05:04,160 -And when we do that, we see that nan first  -appears in the loss, but only in some samples!   - -42 -00:05:04,960 --> 00:05:08,480 -You can see this in more detail in the  -accompanying section of the course notes,   - -43 -00:05:11,040 --> 00:05:17,120 -but we find that if we look at the labels, the  -samples with a loss of nan all have a label of 2!   
- -44 -00:05:17,760 --> 00:05:24,400 -This gives us a very strong clue - if we check  -the model, with model.config.num_labels, we see   - -45 -00:05:24,400 --> 00:05:30,080 -the model thinks there's only 2 labels, but if  -we see a value of 2, that means there's at least   - -46 -00:05:30,080 --> 00:05:36,400 -3 labels, because 0 is a label too! So we got  -a loss of nan because we got an "impossible"   - -47 -00:05:36,400 --> 00:05:43,040 -label. To fix that, we need to go back and set  -the model to have the right number of labels.   - -48 -00:05:43,680 --> 00:05:52,240 -We can set num_labels=3 when we initialize the  -model with from_pretrained. So now we think our   - -49 -00:05:52,240 --> 00:05:58,000 -data is good and our model is good, so training  -should work. And if we try running model.fit(),   - -50 -00:05:58,000 --> 00:06:07,600 -we get... hmm. The loss goes down, but it's  -not very quick. And if we keep running it out,   - -51 -00:06:07,600 --> 00:06:13,600 -we'll find that it stalls at a fairly high value.  -What's going on? Well, when things are mostly   - -52 -00:06:13,600 --> 00:06:19,280 -working, but training is just slow, that can  -often be a good time to look at your optimizer   - -53 -00:06:19,280 --> 00:06:24,560 -and training hyperparameters. And this is where  -I want to mention one of the most common sources   - -54 -00:06:24,560 --> 00:06:30,480 -of issues when you're working with Keras - you  -can name things like optimizers with strings,   - -55 -00:06:32,960 --> 00:06:37,680 -but if you do that, all of the options  -get silently set to their default values.   - -56 -00:06:38,240 --> 00:06:43,920 -So we specified our optimizer as Adam, but in  -the process we invisibly got the default learning   - -57 -00:06:43,920 --> 00:06:51,440 -rate, which is 1e-3, or ten to the power of minus  -3. This is way too high for training transformer   - -58 -00:06:51,440 --> 00:06:59,120 -models! We should go back and specify the learning  -rate directly - good values are between 1e-5   - -59 -00:06:59,760 --> 00:07:06,800 -and 1e-4. Let's split the difference and  -pick 5e-5. And if you recompile with that,   - -60 -00:07:06,800 --> 00:07:16,880 -you'll find that training actually works, at last.  -Again, I went through this quite quickly, and I   - -61 -00:07:16,880 --> 00:07:20,720 -recommend checking out the course notes for this  -to see this in more detail and to experiment with   - -62 -00:07:20,720 --> 00:07:43,840 -the code yourself. Good luck, and remember to take  -breaks if your code is giving you a hard time! +1 +00:00:00,212 --> 00:00:02,879 +(air whooshing) + +2 +00:00:04,680 --> 00:00:08,130 +- Some bugs in your code +are very straightforward. + +3 +00:00:08,130 --> 00:00:11,580 +You try running it, you get +a syntax error somewhere, + +4 +00:00:11,580 --> 00:00:14,490 +Python tells you exactly +where, and you fix it. + +5 +00:00:14,490 --> 00:00:17,760 +This is great, it's simple +and it's satisfying. + +6 +00:00:17,760 --> 00:00:20,310 +Sometimes, though, things crash + +7 +00:00:20,310 --> 00:00:23,670 +and the error is impossible to understand. 
+ +8 +00:00:23,670 --> 00:00:26,700 +This happens a lot in machine +learning for a few reasons, + +9 +00:00:26,700 --> 00:00:29,310 +you're working with big data structures, + +10 +00:00:29,310 --> 00:00:31,440 +you're using these big, complex libraries + +11 +00:00:31,440 --> 00:00:33,420 +with a lot of moving parts, + +12 +00:00:33,420 --> 00:00:35,310 +and also you're doing +a lot of GPU computing, + +13 +00:00:35,310 --> 00:00:38,490 +and that in general is much +more difficult to debug. + +14 +00:00:38,490 --> 00:00:40,260 +In Keras there's the additional problem + +15 +00:00:40,260 --> 00:00:43,140 +that your models are often +compiled before execution, + +16 +00:00:43,140 --> 00:00:44,400 +which is great for performance + +17 +00:00:44,400 --> 00:00:47,430 +but it makes debugging them +very difficult as well. + +18 +00:00:47,430 --> 00:00:50,370 +So, this is going to be +a video about what to do + +19 +00:00:50,370 --> 00:00:52,410 +when you run into one +of those nightmare bugs + +20 +00:00:52,410 --> 00:00:55,210 +and you just have no idea +where to begin with fixing it. + +21 +00:00:56,370 --> 00:00:58,920 +So, to give you some intuitions for + +22 +00:00:58,920 --> 00:01:01,530 +the most common things that go wrong + +23 +00:01:01,530 --> 00:01:03,573 +and cause these weird issues, + +24 +00:01:04,800 --> 00:01:07,530 +and show you where to look +for the sources of bugs + +25 +00:01:07,530 --> 00:01:10,560 +that you encounter, let's +use this example script. + +26 +00:01:10,560 --> 00:01:12,900 +So, I'll show it to you here in two parts. + +27 +00:01:12,900 --> 00:01:16,410 +First, we do all our +imports, we load a dataset, + +28 +00:01:16,410 --> 00:01:20,280 +we create our tokenizer and +we tokenize the dataset. + +29 +00:01:20,280 --> 00:01:23,640 +Next, we convert our datasets +to TensorFlow datasets, + +30 +00:01:23,640 --> 00:01:26,100 +so that's tf.data.Dataset, + +31 +00:01:26,100 --> 00:01:28,500 +and that's so that we can run fit on them, + +32 +00:01:28,500 --> 00:01:31,170 +and then we load our model +from a pretrained checkpoint, + +33 +00:01:31,170 --> 00:01:33,870 +we compile it and we fit +it with those datasets. + +34 +00:01:33,870 --> 00:01:35,970 +So, this seems straightforward enough, + +35 +00:01:35,970 --> 00:01:38,220 +it's similar to what we've +done in the course before. + +36 +00:01:38,220 --> 00:01:40,650 +But beware, this is spooky code + +37 +00:01:40,650 --> 00:01:43,590 +and hides many dark +and mysterious secrets. + +38 +00:01:43,590 --> 00:01:46,050 +So, what happens when we run it? + +39 +00:01:46,050 --> 00:01:48,840 +Well, it's not great. + +40 +00:01:48,840 --> 00:01:52,320 +So, we get this error message, +but what does it mean? + +41 +00:01:52,320 --> 00:01:55,470 +We tried to train on our +data, but we got no gradient? + +42 +00:01:55,470 --> 00:01:59,130 +It's pretty perplexing, I +mean, how do we even begin + +43 +00:01:59,130 --> 00:02:01,500 +to debug not getting a gradient? + +44 +00:02:01,500 --> 00:02:03,930 +So, when the error you get +doesn't immediately suggest + +45 +00:02:03,930 --> 00:02:06,630 +where the problem is, the best solution + +46 +00:02:06,630 --> 00:02:09,180 +is often to walk through +things in sequence, + +47 +00:02:09,180 --> 00:02:12,900 +making sure at each stage +that the outputs look right, + +48 +00:02:12,900 --> 00:02:15,300 +that everything looks okay at that point. 
+
+49
+00:02:15,300 --> 00:02:17,730
+And, of course, that
+means the place to start

+50
+00:02:17,730 --> 00:02:19,473
+is always to check your data.

+51
+00:02:20,670 --> 00:02:22,050
+So, the best way to make sure

+52
+00:02:22,050 --> 00:02:24,480
+that the data you're
+giving the model is good,

+53
+00:02:24,480 --> 00:02:27,690
+is to grab a batch from
+the tf.data.Dataset

+54
+00:02:27,690 --> 00:02:29,520
+that your model is training on,

+55
+00:02:29,520 --> 00:02:31,560
+and that's because it's right at the end

+56
+00:02:31,560 --> 00:02:33,990
+of the data pipeline.

+57
+00:02:33,990 --> 00:02:36,990
+And so that means that if
+those outputs are good,

+58
+00:02:36,990 --> 00:02:39,990
+you're guaranteed that your
+data pipeline is working well.

+59
+00:02:39,990 --> 00:02:42,600
+So, we can do that by
+looping over the dataset

+60
+00:02:42,600 --> 00:02:44,790
+for one iteration and then breaking,

+61
+00:02:44,790 --> 00:02:46,980
+and that gives us a single batch.

+62
+00:02:46,980 --> 00:02:49,443
+So, what do we get when
+we inspect that batch?

+63
+00:02:50,460 --> 00:02:52,500
+We'll see that we're
+not getting any gradient

+64
+00:02:52,500 --> 00:02:55,530
+because we're not passing labels to Keras.

+65
+00:02:55,530 --> 00:02:57,510
+So, our labels are in the batch,

+66
+00:02:57,510 --> 00:02:59,670
+but they're a key in the input dictionary

+67
+00:02:59,670 --> 00:03:02,340
+and they're not a separate
+label as Keras expects,

+68
+00:03:02,340 --> 00:03:04,830
+so this is one of the most
+common issues you'll encounter

+69
+00:03:04,830 --> 00:03:07,590
+when training Transformers
+models with TensorFlow.

+70
+00:03:07,590 --> 00:03:10,980
+Our models can all
+compute loss internally,

+71
+00:03:10,980 --> 00:03:13,140
+but to use that loss for training

+72
+00:03:13,140 --> 00:03:15,960
+the labels need to be passed
+in the input dictionary,

+73
+00:03:15,960 --> 00:03:17,940
+where the model can see them.

+74
+00:03:17,940 --> 00:03:20,280
+This internal loss is the loss that we use

+75
+00:03:20,280 --> 00:03:23,760
+when we don't specify a
+loss when we call compile,

+76
+00:03:23,760 --> 00:03:25,660
+when we don't specify a loss argument.

+77
+00:03:26,520 --> 00:03:27,870
+So, Keras, on the other hand,

+78
+00:03:27,870 --> 00:03:30,570
+usually expects labels
+to be passed separately

+79
+00:03:30,570 --> 00:03:32,130
+from the input dictionary,

+80
+00:03:32,130 --> 00:03:34,110
+and not to be visible to the model,

+81
+00:03:34,110 --> 00:03:36,600
+and loss computations will usually fail

+82
+00:03:36,600 --> 00:03:38,220
+if you don't do that.

+83
+00:03:38,220 --> 00:03:40,380
+So we need to choose one or the other,

+84
+00:03:40,380 --> 00:03:42,780
+either we use the model's internal loss

+85
+00:03:42,780 --> 00:03:44,940
+and keep the labels where they are,

+86
+00:03:44,940 --> 00:03:46,980
+or we keep using Keras losses

+87
+00:03:46,980 --> 00:03:50,520
+but we move the labels to
+the place Keras expects them.

+88
+00:03:50,520 --> 00:03:53,310
+So, for simplicity here,
+let's fix this issue

+89
+00:03:53,310 --> 00:03:55,860
+by using the model's internal losses,

+90
+00:03:55,860 --> 00:03:57,900
+and we do that by
+removing the loss argument

+91
+00:03:57,900 --> 00:03:59,343
+from the call to compile.

+92
+00:04:00,540 --> 00:04:03,000
+So, what happens if we try training now? 
+
+93
+00:04:03,000 --> 00:04:08,000
+So we recompile with that, we
+call model.fit, what happens?

+94
+00:04:08,220 --> 00:04:13,050
+Well, it runs this time but
+now we get a loss of NaN.

+95
+00:04:13,050 --> 00:04:16,440
+So, that's not good,
+NaN means not a number

+96
+00:04:16,440 --> 00:04:19,140
+and it's not a good
+loss to have in general.

+97
+00:04:19,140 --> 00:04:21,000
+In fact, if we inspect our model now,

+98
+00:04:21,000 --> 00:04:23,970
+we'll see that not only
+are all the outputs NaN,

+99
+00:04:23,970 --> 00:04:27,600
+all the weights are NaN as
+well, as well as the loss.

+100
+00:04:27,600 --> 00:04:30,810
+So once a single NaN creeps
+into your computations,

+101
+00:04:30,810 --> 00:04:34,530
+it tends to spread, because
+it propagates from the loss

+102
+00:04:34,530 --> 00:04:36,420
+and once it's at the loss
+it's at the gradient,

+103
+00:04:36,420 --> 00:04:37,530
+it gets to the gradient,

+104
+00:04:37,530 --> 00:04:38,910
+and then once it's in the gradient

+105
+00:04:38,910 --> 00:04:41,280
+it enters the weight updates,

+106
+00:04:41,280 --> 00:04:43,980
+and then all your weight
+updates end up as NaN as well.

+107
+00:04:43,980 --> 00:04:46,950
+So NaN just completely
+destroyed our model here,

+108
+00:04:46,950 --> 00:04:49,560
+but where did it creep in first?

+109
+00:04:49,560 --> 00:04:52,140
+So to find out, we need to go back to a point

+110
+00:04:52,140 --> 00:04:53,490
+before the model was destroyed,

+111
+00:04:53,490 --> 00:04:55,440
+we need to re-initialize the model

+112
+00:04:55,440 --> 00:04:58,590
+and look at the outputs
+for just the first batch.

+113
+00:04:58,590 --> 00:04:59,850
+And when we do that,

+114
+00:04:59,850 --> 00:05:02,790
+we see that NaN first appears in the loss,

+115
+00:05:02,790 --> 00:05:04,980
+but only in some samples.

+116
+00:05:04,980 --> 00:05:06,540
+So you can see this in more detail

+117
+00:05:06,540 --> 00:05:09,090
+in the accompanying section
+of the course notes,

+118
+00:05:09,090 --> 00:05:11,220
+I am moving fairly quickly here,

+119
+00:05:11,220 --> 00:05:13,500
+but we find that if we look at the labels,

+120
+00:05:13,500 --> 00:05:17,790
+the samples with a loss of
+NaN all have a label of two.

+121
+00:05:17,790 --> 00:05:19,950
+So this gives us a very strong clue,

+122
+00:05:19,950 --> 00:05:24,060
+if we check the model with
+model.config.num_labels,

+123
+00:05:24,060 --> 00:05:26,760
+we see that the model thinks
+there's only two labels,

+124
+00:05:26,760 --> 00:05:28,950
+but if we see a value of two,

+125
+00:05:28,950 --> 00:05:31,200
+that means there's at least three labels

+126
+00:05:31,200 --> 00:05:33,630
+because 0 is a label as well.

+127
+00:05:33,630 --> 00:05:35,070
+So we got a loss of NaN

+128
+00:05:35,070 --> 00:05:37,887
+because we got an "impossible"
+label in our label set,

+129
+00:05:37,887 --> 00:05:41,010
+and to fix that we need to
+go back and set the model

+130
+00:05:41,010 --> 00:05:43,650
+to expect the right number of labels,

+131
+00:05:43,650 --> 00:05:45,870
+so we can set num_labels=3

+132
+00:05:45,870 --> 00:05:48,540
+when we initialize the
+model with from_pretrained,

+133
+00:05:48,540 --> 00:05:51,450
+and now hopefully we can avoid this issue. 
+ +134 +00:05:51,450 --> 00:05:54,660 +So, now we think our data is +good and our model is good + +135 +00:05:54,660 --> 00:05:56,220 +and so training should work + +136 +00:05:56,220 --> 00:06:00,510 +but if we try running +model.fit, we, well... + +137 +00:06:00,510 --> 00:06:02,040 +I mean, we do get a loss, + +138 +00:06:02,040 --> 00:06:03,930 +it is a number and it is going down + +139 +00:06:03,930 --> 00:06:06,090 +but it's not going down very quickly + +140 +00:06:06,090 --> 00:06:07,770 +and if we keep running this out, + +141 +00:06:07,770 --> 00:06:10,980 +we'll find that it stalls +at a fairly high loss value. + +142 +00:06:10,980 --> 00:06:12,450 +So, what's going on? + +143 +00:06:12,450 --> 00:06:14,130 +Well, when things are mostly working, + +144 +00:06:14,130 --> 00:06:16,620 +but training is just slow or a bit odd, + +145 +00:06:16,620 --> 00:06:19,470 +that can often be a good time +to look at your optimizer + +146 +00:06:19,470 --> 00:06:22,020 +and your training hyperparameters. + +147 +00:06:22,020 --> 00:06:23,460 +And this is where I want to mention + +148 +00:06:23,460 --> 00:06:25,320 +one of the most common sources of issues + +149 +00:06:25,320 --> 00:06:27,000 +when you're working with Keras, + +150 +00:06:27,000 --> 00:06:30,870 +you can name things like +optimizers with strings, + +151 +00:06:30,870 --> 00:06:33,180 +so Keras supports that +and it's very convenient, + +152 +00:06:33,180 --> 00:06:35,460 +but if you do that all of the options + +153 +00:06:35,460 --> 00:06:38,400 +get silently set to their default values. + +154 +00:06:38,400 --> 00:06:41,190 +So we specified our optimizer as Adam, + +155 +00:06:41,190 --> 00:06:43,110 +but in the process we invisibly got + +156 +00:06:43,110 --> 00:06:46,260 +the default learning rate, which is 1e-3, + +157 +00:06:46,260 --> 00:06:48,630 +or 10 to the power of -3. + +158 +00:06:48,630 --> 00:06:50,550 +So this learning rate is way too high + +159 +00:06:50,550 --> 00:06:52,530 +for training transformer models, + +160 +00:06:52,530 --> 00:06:55,620 +we should go back and specify +the learning rate directly, + +161 +00:06:55,620 --> 00:06:57,060 +not using a string. + +162 +00:06:57,060 --> 00:07:01,290 +So, good values here are +between 1e-5 and 1e-4 + +163 +00:07:01,290 --> 00:07:04,233 +so let's split the +difference and pick 5e-5. + +164 +00:07:05,310 --> 00:07:06,990 +So if you recompile with that, + +165 +00:07:06,990 --> 00:07:09,840 +you'll find that training +actually works, at last. + +166 +00:07:09,840 --> 00:07:11,700 +The loss goes down efficiently + +167 +00:07:11,700 --> 00:07:14,070 +and it converges to a lower value. + +168 +00:07:14,070 --> 00:07:16,410 +So, again, I did go +through this quite quickly + +169 +00:07:16,410 --> 00:07:18,720 +and I strongly recommend +checking out the course notes + +170 +00:07:18,720 --> 00:07:20,040 +to see this in more detail, + +171 +00:07:20,040 --> 00:07:21,600 +and to experiment with the code yourself + +172 +00:07:21,600 --> 00:07:23,490 +and see what the errors look like + +173 +00:07:23,490 --> 00:07:25,380 +and how you can approach them, + +174 +00:07:25,380 --> 00:07:27,930 +but I hope I've given +you here a quick summary + +175 +00:07:27,930 --> 00:07:30,510 +of the most common bugs + +176 +00:07:30,510 --> 00:07:32,880 +and maybe the most common +debugging approaches + +177 +00:07:32,880 --> 00:07:33,960 +to dealing with them. 
+ +178 +00:07:33,960 --> 00:07:37,020 +So, good luck, and remember +to take plenty of breaks + +179 +00:07:37,020 --> 00:07:38,970 +if your code is giving you a hard time. + +180 +00:07:39,805 --> 00:07:42,472 +(air whooshing) + diff --git a/subtitles/en/75_writing-a-good-issue.srt b/subtitles/en/75_writing-a-good-issue.srt index 9c21d200b..5a03d5820 100644 --- a/subtitles/en/75_writing-a-good-issue.srt +++ b/subtitles/en/75_writing-a-good-issue.srt @@ -1,164 +1,330 @@ -1 -00:00:05,440 --> 00:00:11,040 -How to write a good issue on GitHub? GitHub  -is the main place for the Hugging Face open   - -2 -00:00:11,040 --> 00:00:15,920 -source libraries, and should always go there  -to report a bug or ask for a new feature. For   - -3 -00:00:15,920 --> 00:00:21,680 -more general questions or to debug your own code,  -use the forums (see the video linked below). It's   - -4 -00:00:21,680 --> 00:00:25,920 -very important to write good issues as it will  -help the bug you uncovered be fixed in no time.   - -5 -00:00:26,960 --> 00:00:31,760 -For this video, we have created a version of  -Transformers with a bug. You can install it by   - -6 -00:00:31,760 --> 00:00:36,080 -executing this command in a notebook (remove the  -exclamation mark to execute it in a terminal).   - -7 -00:00:37,040 --> 00:00:43,440 -In this version, the following example fails. The  -error is rather cryptic and does not seem to come   - -8 -00:00:43,440 --> 00:00:49,600 -from anything in our code, so it seems we have a  -bug to report! The first thing to do in this case   - -9 -00:00:49,600 --> 00:00:53,200 -is to try to find the smallest amount of  -code possible that reproduces the bug.   - -10 -00:00:54,000 --> 00:00:59,680 -In our case, inspecting the traceback, we see the  -failure happens inside the pipeline function when   - -11 -00:00:59,680 --> 00:01:06,400 -it calls AutoTokenizer.from_pretrained. Using the  -debugger, we find the values passed to that method   - -12 -00:01:06,400 --> 00:01:11,840 -and can thus create a small sample of code  -that hopefully generates the same error.   - -13 -00:01:12,560 --> 00:01:16,800 -It's very important to go though this step as you  -may realize the error was on your side and not   - -14 -00:01:16,800 --> 00:01:21,280 -a bug in the library, but it also will make it  -easier for the maintainers to fix your problem.   - -15 -00:01:22,080 --> 00:01:24,880 -Here we can play around a bit more with  -this code and notice the error happens   - -16 -00:01:24,880 --> 00:01:31,040 -for different checkpoints and not just this one,  -and that it disappears when we use use_fast=False   - -17 -00:01:31,040 --> 00:01:36,400 -inside our tokenizer call. The important part  -is to have something that does not depend on any   - -18 -00:01:36,400 --> 00:01:42,800 -external files or data. Try to replace your data  -by fake values if you can't share it. With all   - -19 -00:01:42,800 --> 00:01:48,480 -of this done, we are ready to start writing our  -issue. Click on the button next to Bug Report and   - -20 -00:01:48,480 --> 00:01:54,160 -you will discover there is a template to fill. It  -will only take you a couple of minutes. The first   - -21 -00:01:54,160 --> 00:02:00,000 -thing is to properly name your issue. Don't pick  -a title that is too vague! Then you have to fill   - -22 -00:02:00,000 --> 00:02:04,800 -your environment information. There is a command  -provided by the Transformers library to do this.   
- -23 -00:02:05,520 --> 00:02:09,840 -Just execute it in your notebook or in  -a terminal, and copy paste the results.   - -24 -00:02:10,800 --> 00:02:15,600 -There are two last questions to fill manually  -(to which the answers are no and no in our case).   - -25 -00:02:17,440 --> 00:02:23,680 -Next, we need to determine who to tag. There is  -a full list of usernames. Since our issue has   - -26 -00:02:23,680 --> 00:02:28,880 -to do with tokenizers, we pick the maintainer  -associated with them. There is no point tagging   - -27 -00:02:28,880 --> 00:02:32,640 -more than 3 people, they will redirect you  -to the right person if you made a mistake.   - -28 -00:02:34,320 --> 00:02:37,280 -Next, we have to give the information  -necessary to reproduce the bug.   - -29 -00:02:38,000 --> 00:02:43,280 -We paste our sample, and put it between two  -lines with three backticks so it's formatted   - -30 -00:02:43,280 --> 00:02:49,840 -properly. We also paste the full traceback, still  -between two lines of three backticks. Lastly,   - -31 -00:02:50,400 --> 00:02:54,400 -we can add any additional information about  -what we tried to debug the issue at hand.   - -32 -00:02:54,960 --> 00:02:58,800 -With all of this, you should expect an answer to  -your issue pretty fast, and hopefully, a quick   - -33 -00:02:58,800 --> 00:03:03,840 -fix! Note that all the advise in this video  -applies for almost every open-source project. +1 +00:00:05,610 --> 00:00:08,557 +- How to write a good issue on GitHub? + +2 +00:00:08,557 --> 00:00:10,080 +GitHub is the main place + +3 +00:00:10,080 --> 00:00:12,000 +for the Hugging Face +open source libraries, + +4 +00:00:12,000 --> 00:00:14,010 +and you should always +go there to report a bug + +5 +00:00:14,010 --> 00:00:16,020 +or ask for a new feature. + +6 +00:00:16,020 --> 00:00:18,660 +For more general questions +or to debug your own code + +7 +00:00:18,660 --> 00:00:21,707 +use the forums, see +the video linked below. + +8 +00:00:21,707 --> 00:00:23,677 +It's very important to write good issues + +9 +00:00:23,677 --> 00:00:27,232 +as it will help the bug you +uncovered be fixed in no time. + +10 +00:00:27,232 --> 00:00:29,750 +For this video, we have created +a version of Transformers + +11 +00:00:29,750 --> 00:00:31,066 +with a bug. + +12 +00:00:31,066 --> 00:00:33,783 +You can install it by executing +this command in a notebook, + +13 +00:00:33,783 --> 00:00:37,239 +remove the exclamation mark +to execute it in a terminal. + +14 +00:00:37,239 --> 00:00:41,016 +In this version, the +following example fails. + +15 +00:00:41,016 --> 00:00:42,472 +The error is rather cryptic + +16 +00:00:42,472 --> 00:00:45,184 +and does not seem to come +from anything in our code, + +17 +00:00:45,184 --> 00:00:48,157 +so it seems we have a bug to report. + +18 +00:00:48,157 --> 00:00:49,858 +The first thing to do in this case + +19 +00:00:49,858 --> 00:00:52,053 +is to try to find the smallest +amount of code possible + +20 +00:00:52,053 --> 00:00:54,059 +that reproduces the bug. + +21 +00:00:54,059 --> 00:00:56,802 +In our case, inspecting the traceback, + +22 +00:00:56,802 --> 00:00:59,645 +we see the failure happens +inside the pipeline function + +23 +00:00:59,645 --> 00:01:03,158 +when it calls +AutoTokenizer.from_pretrained. 
+
+24
+00:01:03,158 --> 00:01:06,609
+Using the debugger, we find the
+values passed to that method

+25
+00:01:06,609 --> 00:01:08,849
+and can thus create a small sample of code

+26
+00:01:08,849 --> 00:01:12,802
+that hopefully generates the same error.

+27
+00:01:12,802 --> 00:01:14,726
+It's very important to go through this step

+28
+00:01:14,726 --> 00:01:16,770
+as you may realize the
+error was on your side

+29
+00:01:16,770 --> 00:01:18,360
+and not a bug in the library,

+30
+00:01:18,360 --> 00:01:20,610
+but it also will make it
+easier for the maintainers

+31
+00:01:20,610 --> 00:01:22,320
+to fix your problem.

+32
+00:01:22,320 --> 00:01:24,030
+Here we can play around
+a bit more with this code

+33
+00:01:24,030 --> 00:01:26,460
+and notice the error happens
+for different checkpoints

+34
+00:01:26,460 --> 00:01:28,050
+and not just this one,

+35
+00:01:28,050 --> 00:01:31,262
+and that it disappears
+when we use use_fast=False

+36
+00:01:31,262 --> 00:01:33,240
+inside our tokenizer call.

+37
+00:01:33,240 --> 00:01:35,190
+The important part is to have something

+38
+00:01:35,190 --> 00:01:38,640
+that does not depend on
+any external files or data.

+39
+00:01:38,640 --> 00:01:40,350
+Try to replace your data by fake values

+40
+00:01:40,350 --> 00:01:41,450
+if you can't share it.

+41
+00:01:42,750 --> 00:01:43,620
+With all of this done,

+42
+00:01:43,620 --> 00:01:46,260
+we are ready to start writing our issue.

+43
+00:01:46,260 --> 00:01:48,600
+Click on the button next to Bug Report

+44
+00:01:48,600 --> 00:01:51,300
+and you will discover that
+there is a template to fill.

+45
+00:01:51,300 --> 00:01:53,940
+It will only take you a couple of minutes.

+46
+00:01:53,940 --> 00:01:56,460
+The first thing is to
+properly name your issue.

+47
+00:01:56,460 --> 00:01:59,100
+Don't pick a title that is too vague.

+48
+00:01:59,100 --> 00:02:02,160
+Then you have to fill your
+environment information.

+49
+00:02:02,160 --> 00:02:03,330
+There is a command provided

+50
+00:02:03,330 --> 00:02:05,700
+by the Transformers library to do this.

+51
+00:02:05,700 --> 00:02:08,550
+Just execute it in your
+notebook or in a terminal

+52
+00:02:08,550 --> 00:02:10,203
+and copy paste the results.

+53
+00:02:11,070 --> 00:02:13,530
+There are two last
+questions to fill manually,

+54
+00:02:13,530 --> 00:02:16,023
+to which the answers are
+no and no in our case.

+55
+00:02:17,550 --> 00:02:20,460
+Next, we need to determine who to tag.

+56
+00:02:20,460 --> 00:02:23,010
+There is a full list of
+usernames in the template.

+57
+00:02:23,010 --> 00:02:25,043
+Since our issue has to do with tokenizers,

+58
+00:02:25,043 --> 00:02:28,170
+we pick the maintainer
+associated with them.

+59
+00:02:28,170 --> 00:02:30,210
+There is no point tagging
+more than 3 people,

+60
+00:02:30,210 --> 00:02:32,010
+they will redirect you to the right person

+61
+00:02:32,010 --> 00:02:33,110
+if you made a mistake.

+62
+00:02:34,410 --> 00:02:36,600
+Next, we have to give
+the information necessary

+63
+00:02:36,600 --> 00:02:38,220
+to reproduce the bug.

+64
+00:02:38,220 --> 00:02:41,010
+We paste our sample, and
+put it between two lines

+65
+00:02:41,010 --> 00:02:44,073
+with three backticks so that
+it's formatted properly.

+66
+00:02:45,210 --> 00:02:47,010
+We also paste the full traceback,

+67
+00:02:47,010 --> 00:02:49,740
+still between two lines
+of three backticks. 
+
+68
+00:02:49,740 --> 00:02:52,650
+Lastly, we can add any
+additional information

+69
+00:02:52,650 --> 00:02:55,200
+about what we tried to
+debug the issue at hand.

+70
+00:02:55,200 --> 00:02:57,030
+With all of this, you
+should expect an answer

+71
+00:02:57,030 --> 00:02:59,880
+to your issue pretty fast
+and hopefully a quick fix.

+72
+00:02:59,880 --> 00:03:01,770
+Note that all the advice in this video

+73
+00:03:01,770 --> 00:03:04,203
+applies for almost every
+open-source project.
+