From 60edea6d56978eae1813f456bf3c7bdcd8eea7de Mon Sep 17 00:00:00 2001
From: Steven
Date: Wed, 7 Sep 2022 10:32:35 -0700
Subject: [PATCH 1/4] skip some code examples for doctests

---
 docs/source/en/quicktour.mdx | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx
index f1b3ca5bf0f688..c81b037385b804 100644
--- a/docs/source/en/quicktour.mdx
+++ b/docs/source/en/quicktour.mdx
@@ -435,8 +435,8 @@ Depending on your task, you'll typically pass the following parameters to [`Trai
 4. Your preprocessed train and test datasets:
 
     ```py
-    >>> train_dataset = dataset["train"]
-    >>> eval_dataset = dataset["eval"]
+    >>> train_dataset = dataset["train"] # doctest: +SKIP
+    >>> eval_dataset = dataset["eval"] # doctest: +SKIP
     ```
 
 5. A [`DataCollator`] to create a batch of examples from your dataset:
@@ -459,13 +459,13 @@ Now gather all these classes in [`Trainer`]:
 ...     eval_dataset=dataset["test"],
 ...     tokenizer=tokenizer,
 ...     data_collator=data_collator,
-... )
+... ) # doctest: +SKIP
 ```
 
 When you're ready, call [`~Trainer.train`] to start training:
 
 ```py
->>> trainer.train()
+>>> trainer.train() # doctest: +SKIP
 ```
@@ -502,11 +502,9 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
 
     ```py
     >>> def tokenize_dataset(dataset):
-    ...     return tokenizer(dataset["text"])
-
-
-    >>> dataset = dataset.map(tokenize_dataset)
-    >>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)
+    ...     return tokenizer(dataset["text"]) # doctest: +SKIP
+    >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP
+    >>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) # doctest: +SKIP
     ```
 
 4. When you're ready, you can call `compile` and `fit` to start training:
@@ -515,7 +513,7 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
     >>> from tensorflow.keras.optimizers import Adam
 
     >>> model.compile(optimizer=Adam(3e-5))
-    >>> model.fit(dataset)
+    >>> model.fit(dataset) # doctest: +SKIP
     ```
 
 ## What's next?
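
An aside on the directive patch 1 introduces: `# doctest: +SKIP` comes from Python's standard `doctest` module and tells the runner not to execute that single example line. That is how documentation snippets that depend on state defined elsewhere (an already-loaded `dataset`, a trained `model`) can live in a doctested file without failing. A minimal, self-contained sketch of the behavior, separate from the patches themselves:

```py
import doctest


def example():
    """
    >>> 1 + 1
    2
    >>> train_dataset = dataset["train"]  # doctest: +SKIP
    """


# Passes silently: the first example executes and matches its expected
# output, while the +SKIP line is never executed, so the undefined name
# `dataset` causes no failure.
doctest.run_docstring_examples(example, {}, verbose=False)
```

Any doctest-based runner honors the marker per-example without extra configuration, which is what lets the quicktour keep illustrative-but-unrunnable lines.
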
From b6e5e74560f1835860a1083eb02c48bcd32a2b77 Mon Sep 17 00:00:00 2001
From: Steven
Date: Wed, 7 Sep 2022 10:35:43 -0700
Subject: [PATCH 2/4] make style

---
 docs/source/en/quicktour.mdx | 20 ++++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx
index c81b037385b804..a536ab304e878b 100644
--- a/docs/source/en/quicktour.mdx
+++ b/docs/source/en/quicktour.mdx
@@ -435,8 +435,8 @@ Depending on your task, you'll typically pass the following parameters to [`Trai
 4. Your preprocessed train and test datasets:
 
     ```py
-    >>> train_dataset = dataset["train"] # doctest: +SKIP
-    >>> eval_dataset = dataset["eval"] # doctest: +SKIP
+    >>> train_dataset = dataset["train"]  # doctest: +SKIP
+    >>> eval_dataset = dataset["eval"]  # doctest: +SKIP
     ```
 
 5. A [`DataCollator`] to create a batch of examples from your dataset:
@@ -459,13 +459,13 @@ Now gather all these classes in [`Trainer`]:
 ...     eval_dataset=dataset["test"],
 ...     tokenizer=tokenizer,
 ...     data_collator=data_collator,
-... ) # doctest: +SKIP
+... )  # doctest: +SKIP
 ```
 
 When you're ready, call [`~Trainer.train`] to start training:
 
 ```py
->>> trainer.train() # doctest: +SKIP
+>>> trainer.train()  # doctest: +SKIP
 ```
@@ -502,9 +502,13 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
 
     ```py
     >>> def tokenize_dataset(dataset):
-    ...     return tokenizer(dataset["text"]) # doctest: +SKIP
-    >>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP
-    >>> tf_dataset = model.prepare_tf_dataset(dataset, batch_size=16, shuffle=True, tokenizer=tokenizer) # doctest: +SKIP
+    ...     return tokenizer(dataset["text"])  # doctest: +SKIP
+
+
+    >>> dataset = dataset.map(tokenize_dataset)  # doctest: +SKIP
+    >>> tf_dataset = model.prepare_tf_dataset(
+    ...     dataset, batch_size=16, shuffle=True, tokenizer=tokenizer
+    ... )  # doctest: +SKIP
     ```
 
 4. When you're ready, you can call `compile` and `fit` to start training:
@@ -513,7 +517,7 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
     >>> from tensorflow.keras.optimizers import Adam
 
     >>> model.compile(optimizer=Adam(3e-5))
-    >>> model.fit(dataset) # doctest: +SKIP
+    >>> model.fit(dataset)  # doctest: +SKIP
     ```
 
 ## What's next?

From 54ec4dddc66334b17035efc40cd6d8ea75c83880 Mon Sep 17 00:00:00 2001
From: Steven
Date: Wed, 7 Sep 2022 11:09:28 -0700
Subject: [PATCH 3/4] fix code snippet formatting

---
 docs/source/en/quicktour.mdx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx
index a536ab304e878b..3a61723bf2d432 100644
--- a/docs/source/en/quicktour.mdx
+++ b/docs/source/en/quicktour.mdx
@@ -503,8 +503,6 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
     ```py
     >>> def tokenize_dataset(dataset):
     ...     return tokenizer(dataset["text"])  # doctest: +SKIP
-
-
     >>> dataset = dataset.map(tokenize_dataset)  # doctest: +SKIP
     >>> tf_dataset = model.prepare_tf_dataset(
     ...     dataset, batch_size=16, shuffle=True, tokenizer=tokenizer
     ... )  # doctest: +SKIP
     ```

From cb628f77cb15ed6cdac1670e03df5c54507f267e Mon Sep 17 00:00:00 2001
From: Steven
Date: Wed, 7 Sep 2022 12:23:30 -0700
Subject: [PATCH 4/4] separate code snippet into two blocks

---
 docs/source/en/quicktour.mdx | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx
index 3a61723bf2d432..3fcdb4fff22457 100644
--- a/docs/source/en/quicktour.mdx
+++ b/docs/source/en/quicktour.mdx
@@ -498,18 +498,23 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs
     >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
     ```
 
-3. Tokenize the dataset and pass it and the tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`]. You can also change the batch size and shuffle the dataset here if you'd like:
+3. Create a function to tokenize the dataset:
 
     ```py
     >>> def tokenize_dataset(dataset):
     ...     return tokenizer(dataset["text"])  # doctest: +SKIP
+    ```
+
+4. Apply the tokenizer over the entire dataset with [`~datasets.Dataset.map`] and then pass the dataset and tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`]. You can also change the batch size and shuffle the dataset here if you'd like:
+
+    ```py
     >>> dataset = dataset.map(tokenize_dataset)  # doctest: +SKIP
     >>> tf_dataset = model.prepare_tf_dataset(
     ...     dataset, batch_size=16, shuffle=True, tokenizer=tokenizer
     ... )  # doctest: +SKIP
     ```
 
-4. When you're ready, you can call `compile` and `fit` to start training:
+5. When you're ready, you can call `compile` and `fit` to start training:
 
     ```py
     >>> from tensorflow.keras.optimizers import Adam
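
With all four patches applied, the TensorFlow portion of the quicktour walks through tokenizing, `map`, `prepare_tf_dataset`, and `compile`/`fit` as separate skipped snippets. For orientation, a rough end-to-end sketch of that flow; the dataset (`rotten_tomatoes`), the model class, and fitting on the prepared `tf_dataset` are illustrative assumptions, since the diffs themselves only pin down the tokenizer checkpoint and the calls shown above:

```py
# Sketch under assumptions: any Hugging Face dataset with a "text" column
# works; rotten_tomatoes and TFAutoModelForSequenceClassification are
# illustrative choices, not specified by the patches.
from datasets import load_dataset
from tensorflow.keras.optimizers import Adam
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

dataset = load_dataset("rotten_tomatoes")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")


def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])


# Tokenize every split, then build a shuffled, batched tf.data.Dataset from
# the training split; passing the tokenizer lets batches be padded on the fly.
dataset = dataset.map(tokenize_dataset)
tf_dataset = model.prepare_tf_dataset(
    dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
)

# transformers TF models pick a sensible default loss when none is given.
model.compile(optimizer=Adam(3e-5))
model.fit(tf_dataset)
```
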