Commit a35f889

Tests: detect lines removed from "utils/not_doctested.txt" and doctest ALL generation files (#25763)

1 parent 483861d

File tree: 4 files changed, +67 −51 lines

docs/source/en/generation_strategies.md

Lines changed: 37 additions & 35 deletions
@@ -55,7 +55,7 @@ When you load a model explicitly, you can inspect the generation configuration t
 >>> from transformers import AutoModelForCausalLM

 >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
->>> model.generation_config
+>>> model.generation_config # doctest: +IGNORE_RESULT
 GenerationConfig {
   "_from_model_config": true,
   "bos_token_id": 50256,
@@ -77,7 +77,7 @@ producing highly repetitive results.
 You can override any `generation_config` by passing the parameters and their values directly to the [`generate`] method:

 ```python
->>> my_model.generate(**inputs, num_beams=4, do_sample=True)
+>>> my_model.generate(**inputs, num_beams=4, do_sample=True) # doctest: +SKIP
 ```

 Even if the default decoding strategy mostly works for your task, you can still tweak a few things. Some of the
@@ -107,11 +107,11 @@ If you would like to share your fine-tuned model with a specific generation conf
 ```python
 >>> from transformers import AutoModelForCausalLM, GenerationConfig

->>> model = AutoModelForCausalLM.from_pretrained("my_account/my_model")
+>>> model = AutoModelForCausalLM.from_pretrained("my_account/my_model") # doctest: +SKIP
 >>> generation_config = GenerationConfig(
 ...     max_new_tokens=50, do_sample=True, top_k=50, eos_token_id=model.config.eos_token_id
 ... )
->>> generation_config.save_pretrained("my_account/my_model", push_to_hub=True)
+>>> generation_config.save_pretrained("my_account/my_model", push_to_hub=True) # doctest: +SKIP
 ```

 You can also store several generation configurations in a single directory, making use of the `config_file_name`
@@ -133,14 +133,15 @@ one for summarization with beam search). You must have the right Hub permissions
 ...     pad_token=model.config.pad_token_id,
 ... )

->>> translation_generation_config.save_pretrained("t5-small", "translation_generation_config.json", push_to_hub=True)
+>>> # Tip: add `push_to_hub=True` to push to the Hub
+>>> translation_generation_config.save_pretrained("/tmp", "translation_generation_config.json")

 >>> # You could then use the named generation config file to parameterize generation
->>> generation_config = GenerationConfig.from_pretrained("t5-small", "translation_generation_config.json")
+>>> generation_config = GenerationConfig.from_pretrained("/tmp", "translation_generation_config.json")
 >>> inputs = tokenizer("translate English to French: Configuration files are easy to use!", return_tensors="pt")
 >>> outputs = model.generate(**inputs, generation_config=generation_config)
 >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
-['Les fichiers de configuration sont faciles à utiliser !']
+['Les fichiers de configuration sont faciles à utiliser!']
 ```

 ## Streaming
@@ -217,10 +218,9 @@ The two main parameters that enable and control the behavior of contrastive sear

 >>> outputs = model.generate(**inputs, penalty_alpha=0.6, top_k=4, max_new_tokens=100)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Hugging Face Company is a family owned and operated business. \
-We pride ourselves on being the best in the business and our customer service is second to none.\
-\n\nIf you have any questions about our products or services, feel free to contact us at any time.\
-We look forward to hearing from you!']
+['Hugging Face Company is a family owned and operated business. We pride ourselves on being the best
+in the business and our customer service is second to none.\n\nIf you have any questions about our
+products or services, feel free to contact us at any time. We look forward to hearing from you!']
 ```

 ### Multinomial sampling
@@ -233,7 +233,8 @@ risk of repetition.
 To enable multinomial sampling set `do_sample=True` and `num_beams=1`.

 ```python
->>> from transformers import AutoTokenizer, AutoModelForCausalLM
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+>>> set_seed(0) # For reproducibility

 >>> checkpoint = "gpt2-large"
 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
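
The `set_seed` calls added in these hunks pin the relevant random number generators so the sampled outputs become deterministic and can be doctested against a fixed expected string. Roughly, a helper of this kind covers the RNGs below — this is an illustrative sketch, not the exact implementation shipped in `transformers` (the real helper also handles TensorFlow and other backends):

```python
import random

import numpy as np
import torch


def set_seed_sketch(seed: int) -> None:
    """Roughly what a `set_seed` helper pins down (illustrative sketch)."""
    random.seed(seed)                 # Python's built-in RNG
    np.random.seed(seed)              # NumPy sampling
    torch.manual_seed(seed)           # torch CPU RNG
    torch.cuda.manual_seed_all(seed)  # all CUDA devices (no-op without a GPU)
```
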
@@ -244,11 +245,8 @@ To enable multinomial sampling set `do_sample=True` and `num_beams=1`.

 >>> outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['Today was an amazing day because we are now in the final stages of our trip to New York City which was very tough. \
-It is a difficult schedule and a challenging part of the year but still worth it. I have been taking things easier and \
-I feel stronger and more motivated to be out there on their tour. Hopefully, that experience is going to help them with \
-their upcoming events which are currently scheduled in Australia.\n\nWe love that they are here. They want to make a \
-name for themselves and become famous for what they']
+['Today was an amazing day because when you go to the World Cup and you don\'t, or when you don\'t get invited,
+that\'s a terrible feeling."']
 ```

 ### Beam-search decoding
@@ -272,7 +270,7 @@ To enable this decoding strategy, specify the `num_beams` (aka number of hypothe

 >>> outputs = model.generate(**inputs, num_beams=5, max_new_tokens=50)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['It is astonishing how one can have such a profound impact on the lives of so many people in such a short period of \
+['It is astonishing how one can have such a profound impact on the lives of so many people in such a short period of
 time."\n\nHe added: "I am very proud of the work I have been able to do in the last few years.\n\n"I have']
 ```

@@ -282,7 +280,8 @@ As the name implies, this decoding strategy combines beam search with multinomia
 the `num_beams` greater than 1, and set `do_sample=True` to use this decoding strategy.

 ```python
->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, set_seed
+>>> set_seed(0) # For reproducibility

 >>> prompt = "translate English to German: The house is wonderful."
 >>> checkpoint = "t5-small"
@@ -309,20 +308,22 @@ The diversity penalty ensures the outputs are distinct across groups, and beam s
 >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

 >>> checkpoint = "google/pegasus-xsum"
->>> prompt = "The Permaculture Design Principles are a set of universal design principles \
->>> that can be applied to any location, climate and culture, and they allow us to design \
->>> the most efficient and sustainable human habitation and food production systems. \
->>> Permaculture is a design system that encompasses a wide variety of disciplines, such \
->>> as ecology, landscape design, environmental science and energy conservation, and the \
->>> Permaculture design principles are drawn from these various disciplines. Each individual \
->>> design principle itself embodies a complete conceptual framework based on sound \
->>> scientific principles. When we bring all these separate principles together, we can \
->>> create a design system that both looks at whole systems, the parts that these systems \
->>> consist of, and how those parts interact with each other to create a complex, dynamic, \
->>> living system. Each design principle serves as a tool that allows us to integrate all \
->>> the separate parts of a design, referred to as elements, into a functional, synergistic, \
->>> whole system, where the elements harmoniously interact and work together in the most \
->>> efficient way possible."
+>>> prompt = (
+...     "The Permaculture Design Principles are a set of universal design principles "
+...     "that can be applied to any location, climate and culture, and they allow us to design "
+...     "the most efficient and sustainable human habitation and food production systems. "
+...     "Permaculture is a design system that encompasses a wide variety of disciplines, such "
+...     "as ecology, landscape design, environmental science and energy conservation, and the "
+...     "Permaculture design principles are drawn from these various disciplines. Each individual "
+...     "design principle itself embodies a complete conceptual framework based on sound "
+...     "scientific principles. When we bring all these separate principles together, we can "
+...     "create a design system that both looks at whole systems, the parts that these systems "
+...     "consist of, and how those parts interact with each other to create a complex, dynamic, "
+...     "living system. Each design principle serves as a tool that allows us to integrate all "
+...     "the separate parts of a design, referred to as elements, into a functional, synergistic, "
+...     "whole system, where the elements harmoniously interact and work together in the most "
+...     "efficient way possible."
+... )

 >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
 >>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -369,7 +370,8 @@ When using assisted decoding with sampling methods, you can use the `temperature`
 just like in multinomial sampling. However, in assisted decoding, reducing the temperature will help improve latency.

 ```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
+>>> set_seed(42) # For reproducibility

 >>> prompt = "Alice and Bob"
 >>> checkpoint = "EleutherAI/pythia-1.4b-deduped"
@@ -382,5 +384,5 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
 >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
 >>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
 >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-["Alice and Bob are sitting on the sofa. Alice says, 'I'm going to my room"]
+['Alice and Bob are going to the same party. It is a small party, in a small']
 ```
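
A note on the two directives used throughout this file: `# doctest: +SKIP` is built into Python's `doctest` module, while `IGNORE_RESULT` is a custom option flag that the test harness must register itself before it is recognized (the repository's conftest.py wires up an equivalent). A minimal sketch of how such a flag can be defined — the class name here is illustrative:

```python
import doctest

# Register a custom option flag so `# doctest: +IGNORE_RESULT` is recognized.
IGNORE_RESULT = doctest.register_optionflag("IGNORE_RESULT")


class IgnoreResultOutputChecker(doctest.OutputChecker):
    """Output checker that still runs the example but accepts any output when flagged."""

    def check_output(self, want, got, optionflags):
        if optionflags & IGNORE_RESULT:
            return True  # statement executed for side effects only
        return super().check_output(want, got, optionflags)


# doctest constructs its checker from this module-level name, so patching it
# makes every subsequently created runner honor the new flag.
doctest.OutputChecker = IgnoreResultOutputChecker
```

This is why `model.generation_config` above can keep its illustrative `GenerationConfig {...}` output in the docs without the test failing on fields that vary across versions.
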

utils/not_doctested.txt

Lines changed: 0 additions & 16 deletions
@@ -14,14 +14,12 @@ docs/source/en/custom_models.md
 docs/source/en/custom_tools.md
 docs/source/en/debugging.md
 docs/source/en/fast_tokenizers.md
-docs/source/en/generation_strategies.md
 docs/source/en/glossary.md
 docs/source/en/hpo_train.md
 docs/source/en/index.md
 docs/source/en/installation.md
 docs/source/en/internal/audio_utils.md
 docs/source/en/internal/file_utils.md
-docs/source/en/internal/generation_utils.md
 docs/source/en/internal/image_processing_utils.md
 docs/source/en/internal/modeling_utils.md
 docs/source/en/internal/pipelines_utils.md
@@ -45,7 +43,6 @@ docs/source/en/main_classes/output.md
 docs/source/en/main_classes/pipelines.md
 docs/source/en/main_classes/processors.md
 docs/source/en/main_classes/quantization.md
-docs/source/en/main_classes/text_generation.md
 docs/source/en/main_classes/tokenizer.md
 docs/source/en/main_classes/trainer.md
 docs/source/en/model_doc/albert.md
@@ -367,16 +364,6 @@ src/transformers/dynamic_module_utils.py
 src/transformers/feature_extraction_sequence_utils.py
 src/transformers/feature_extraction_utils.py
 src/transformers/file_utils.py
-src/transformers/generation/beam_constraints.py
-src/transformers/generation/beam_search.py
-src/transformers/generation/flax_logits_process.py
-src/transformers/generation/flax_utils.py
-src/transformers/generation/stopping_criteria.py
-src/transformers/generation/streamers.py
-src/transformers/generation/tf_logits_process.py
-src/transformers/generation_flax_utils.py
-src/transformers/generation_tf_utils.py
-src/transformers/generation_utils.py
 src/transformers/hf_argparser.py
 src/transformers/hyperparameter_search.py
 src/transformers/image_processing_utils.py
@@ -413,7 +400,6 @@ src/transformers/models/auto/modeling_tf_auto.py
 src/transformers/models/autoformer/configuration_autoformer.py
 src/transformers/models/autoformer/modeling_autoformer.py
 src/transformers/models/bark/convert_suno_to_hf.py
-src/transformers/models/bark/generation_configuration_bark.py
 src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py
 src/transformers/models/bart/modeling_flax_bart.py
 src/transformers/models/bart/modeling_tf_bart.py
@@ -925,9 +911,7 @@ src/transformers/pipelines/object_detection.py
 src/transformers/pipelines/pt_utils.py
 src/transformers/pipelines/question_answering.py
 src/transformers/pipelines/table_question_answering.py
-src/transformers/pipelines/text2text_generation.py
 src/transformers/pipelines/text_classification.py
-src/transformers/pipelines/text_generation.py
 src/transformers/pipelines/token_classification.py
 src/transformers/pipelines/video_classification.py
 src/transformers/pipelines/visual_question_answering.py

utils/slow_documentation_tests.txt

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
+docs/source/en/generation_strategies.md
 docs/source/en/task_summary.md
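
Entries in this file are excluded from the doctest job on CircleCI; the filtering itself lives in `utils/tests_fetcher.py` (its relevant hunk is shown below). A minimal sketch of that exclusion, with a hypothetical candidate list — run from the repository root so the file can be found:

```python
# Hypothetical list of doc files scheduled for doctesting on this run.
test_files_to_run = [
    "docs/source/en/generation_strategies.md",  # listed as slow -> filtered out
    "docs/source/en/quicktour.md",
]

with open("utils/slow_documentation_tests.txt") as fp:
    slow_documentation_tests = set(fp.read().strip().split("\n"))

test_files_to_run = [x for x in test_files_to_run if x not in slow_documentation_tests]
print(test_files_to_run)  # the slow generation_strategies.md entry is gone
```
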

utils/tests_fetcher.py

Lines changed: 29 additions & 0 deletions
@@ -395,6 +395,31 @@ def get_all_doctest_files() -> List[str]:
     return sorted(test_files_to_run)


+def get_new_doctest_files(repo, base_commit, branching_commit) -> List[str]:
+    """
+    Get the list of files that were removed from "utils/not_doctested.txt", between `base_commit` and
+    `branching_commit`.
+
+    Returns:
+        `List[str]`: List of files that were removed from "utils/not_doctested.txt".
+    """
+    for diff_obj in branching_commit.diff(base_commit):
+        # Ignores all but the "utils/not_doctested.txt" file.
+        if diff_obj.a_path != "utils/not_doctested.txt":
+            continue
+        # Loads the two versions
+        folder = Path(repo.working_dir)
+        with checkout_commit(repo, branching_commit):
+            with open(folder / "utils/not_doctested.txt", "r", encoding="utf-8") as f:
+                old_content = f.read()
+        with open(folder / "utils/not_doctested.txt", "r", encoding="utf-8") as f:
+            new_content = f.read()
+        # Compute the removed lines and return them
+        removed_content = set(old_content.split("\n")) - set(new_content.split("\n"))
+        return sorted(removed_content)
+    return []
+
+
 def get_doctest_files(diff_with_last_commit: bool = False) -> List[str]:
     """
     Return a list of python and Markdown files where doc examples have been modified between:
@@ -426,6 +451,10 @@ def get_doctest_files(diff_with_last_commit: bool = False) -> List[str]:

     all_test_files_to_run = get_all_doctest_files()

+    # Add to the test files to run any removed entry from "utils/not_doctested.txt".
+    new_test_files = get_new_doctest_files(repo, repo.head.commit, repo.refs.main.commit)
+    test_files_to_run = list(set(test_files_to_run + new_test_files))
+
     # Do not run slow doctest tests on CircleCI
     with open("utils/slow_documentation_tests.txt") as fp:
         slow_documentation_tests = set(fp.read().strip().split("\n"))
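
The heart of the new check in `get_new_doctest_files` is an ordinary set difference between the two versions of `utils/not_doctested.txt`: anything present at the branching point but absent from the current branch was removed from the exclusion list and must now pass its doctests. A self-contained sketch of that computation with hypothetical in-memory snapshots instead of two git commits:

```python
# "old" stands in for the file at the branching commit, "new" for the current
# branch; the file paths here are made up for illustration.
old_content = "docs/source/en/a.md\ndocs/source/en/generation_strategies.md\ndocs/source/en/b.md"
new_content = "docs/source/en/a.md\ndocs/source/en/b.md"

removed = sorted(set(old_content.split("\n")) - set(new_content.split("\n")))
print(removed)  # ['docs/source/en/generation_strategies.md'] -> now doctested
```
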
