diff --git a/.circleci/config.yml b/.circleci/config.yml
index ef1da2791b7de5..da3b76727170f6 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -74,21 +74,20 @@ jobs:
- checkout
- restore_cache:
keys:
- - v0.3-torch_and_tf-{{ checksum "setup.py" }}
- - v0.3-{{ checksum "setup.py" }}
+ - v0.4-torch_and_tf-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- - run: pip install git+https://github.com/huggingface/nlp
- run: pip install .[sklearn,tf-cpu,torch,testing]
- - run: pip install codecov pytest-cov
- save_cache:
- key: v0.3-{{ checksum "setup.py" }}
+ key: v0.4-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ --cov | tee output.txt
- - run: codecov
+ - run: RUN_PT_TF_CROSS_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
- store_artifacts:
- path: ~/transformers/output.txt
- destination: test_output.txt
+ path: ~/transformers/tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/reports
+
run_tests_torch:
working_directory: ~/transformers
docker:
@@ -101,19 +100,20 @@ jobs:
- checkout
- restore_cache:
keys:
- - v0.3-torch-{{ checksum "setup.py" }}
- - v0.3-{{ checksum "setup.py" }}
+ - v0.4-torch-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- - run: pip install git+https://github.com/huggingface/nlp
- run: pip install .[sklearn,torch,testing]
- save_cache:
- key: v0.3-torch-{{ checksum "setup.py" }}
+ key: v0.4-torch-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
+ - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/tests_output.txt
- store_artifacts:
- path: ~/transformers/output.txt
- destination: test_output.txt
+ path: ~/transformers/reports
+
run_tests_tf:
working_directory: ~/transformers
docker:
@@ -126,19 +126,98 @@ jobs:
- checkout
- restore_cache:
keys:
- - v0.3-tf-{{ checksum "setup.py" }}
- - v0.3-{{ checksum "setup.py" }}
+ - v0.4-tf-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
+ - run: pip install --upgrade pip
+ - run: pip install .[sklearn,tf-cpu,testing]
+ - save_cache:
+ key: v0.4-tf-{{ checksum "setup.py" }}
+ paths:
+ - '~/.cache/pip'
+ - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf ./tests/ | tee tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/reports
+
+ run_tests_flax:
+ working_directory: ~/transformers
+ docker:
+ - image: circleci/python:3.7
+ environment:
+ OMP_NUM_THREADS: 1
+ resource_class: xlarge
+ parallelism: 1
+ steps:
+ - checkout
+ - restore_cache:
+ keys:
+ - v0.4-flax-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
+ - run: pip install --upgrade pip
+ - run: sudo pip install .[flax,sklearn,torch,testing]
+ - save_cache:
+ key: v0.4-flax-{{ checksum "setup.py" }}
+ paths:
+ - '~/.cache/pip'
+ - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax ./tests/ | tee tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/reports
+
+ run_tests_pipelines_torch:
+ working_directory: ~/transformers
+ docker:
+ - image: circleci/python:3.7
+ environment:
+ OMP_NUM_THREADS: 1
+ resource_class: xlarge
+ parallelism: 1
+ steps:
+ - checkout
+ - restore_cache:
+ keys:
+ - v0.4-torch-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
+ - run: pip install --upgrade pip
+ - run: pip install .[sklearn,torch,testing]
+ - save_cache:
+ key: v0.4-torch-{{ checksum "setup.py" }}
+ paths:
+ - '~/.cache/pip'
+ - run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/reports
+
+ run_tests_pipelines_tf:
+ working_directory: ~/transformers
+ docker:
+ - image: circleci/python:3.7
+ environment:
+ OMP_NUM_THREADS: 1
+ resource_class: xlarge
+ parallelism: 1
+ steps:
+ - checkout
+ - restore_cache:
+ keys:
+ - v0.4-tf-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- - run: pip install git+https://github.com/huggingface/nlp
- run: pip install .[sklearn,tf-cpu,testing]
- save_cache:
- key: v0.3-tf-{{ checksum "setup.py" }}
+ key: v0.4-tf-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- - run: python -m pytest -n 8 --dist=loadfile -s ./tests/ | tee output.txt
+ - run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/tests_output.txt
- store_artifacts:
- path: ~/transformers/output.txt
- destination: test_output.txt
+ path: ~/transformers/reports
+
run_tests_custom_tokenizers:
working_directory: ~/transformers
docker:
@@ -149,19 +228,21 @@ jobs:
- checkout
- restore_cache:
keys:
- - v0.3-custom_tokenizers-{{ checksum "setup.py" }}
- - v0.3-{{ checksum "setup.py" }}
+ - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[ja,testing]
- run: python -m unidic download
- save_cache:
- key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
+ key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- - run: python -m pytest -s ./tests/test_tokenization_bert_japanese.py | tee output.txt
+ - run: python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
+ - store_artifacts:
+ path: ~/transformers/tests_output.txt
- store_artifacts:
- path: ~/transformers/output.txt
- destination: test_output.txt
+ path: ~/transformers/reports
+
run_examples_torch:
working_directory: ~/transformers
docker:
@@ -174,19 +255,21 @@ jobs:
- checkout
- restore_cache:
keys:
- - v0.3-torch_examples-{{ checksum "setup.py" }}
- - v0.3-{{ checksum "setup.py" }}
+ - v0.4-torch_examples-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing]
- run: pip install -r examples/requirements.txt
- save_cache:
- key: v0.3-torch_examples-{{ checksum "setup.py" }}
+ key: v0.4-torch_examples-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- - run: python -m pytest -n 8 --dist=loadfile -rA -s ./examples/ | tee output.txt
+ - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
+ - store_artifacts:
+ path: ~/transformers/examples_output.txt
- store_artifacts:
- path: ~/transformers/output.txt
- destination: test_output.txt
+ path: ~/transformers/reports
+
build_doc:
working_directory: ~/transformers
docker:
@@ -195,17 +278,18 @@ jobs:
- checkout
- restore_cache:
keys:
- - v0.3-build_doc-{{ checksum "setup.py" }}
- - v0.3-{{ checksum "setup.py" }}
+ - v0.4-build_doc-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- - run: pip install .[tf,torch,docs]
+ - run: pip install ."[all, docs]"
- save_cache:
- key: v0.3-build_doc-{{ checksum "setup.py" }}
+ key: v0.4-build_doc-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: cd docs && make html SPHINXOPTS="-W"
- store_artifacts:
path: ./docs/_build
+
deploy_doc:
working_directory: ~/transformers
docker:
@@ -217,14 +301,15 @@ jobs:
- checkout
- restore_cache:
keys:
- - v0.3-deploy_doc-{{ checksum "setup.py" }}
- - v0.3-{{ checksum "setup.py" }}
- - run: pip install .[tf,torch,docs]
+ - v0.4-deploy_doc-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
+ - run: pip install ."[all,docs]"
- save_cache:
- key: v0.3-deploy_doc-{{ checksum "setup.py" }}
+ key: v0.4-deploy_doc-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: ./.circleci/deploy.sh
+
check_code_quality:
working_directory: ~/transformers
docker:
@@ -235,19 +320,23 @@ jobs:
- checkout
- restore_cache:
keys:
- - v0.3-code_quality-{{ checksum "setup.py" }}
- - v0.3-{{ checksum "setup.py" }}
+ - v0.4-code_quality-{{ checksum "setup.py" }}
+ - v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install isort
- - run: pip install .[tf,torch,quality]
+ - run: pip install .[tf,torch,flax,quality]
- save_cache:
- key: v0.3-code_quality-{{ checksum "setup.py" }}
+ key: v0.4-code_quality-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- - run: black --check --line-length 119 --target-version py35 examples templates tests src utils
- - run: isort --check-only --recursive examples templates tests src utils
- - run: flake8 examples templates tests src utils
+ - run: black --check examples tests src utils
+ - run: isort --check-only examples tests src utils
+ - run: flake8 examples tests src utils
+ - run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
+ - run: python utils/check_copies.py
+ - run: python utils/check_dummies.py
- run: python utils/check_repo.py
+
check_repository_consistency:
working_directory: ~/transformers
docker:
@@ -278,6 +367,7 @@ jobs:
- setup_remote_docker
- *build_push_docker
- *deploy_cluster
+
cleanup-gke-jobs:
docker:
- image: circleci/python:3.6
@@ -287,6 +377,7 @@ jobs:
cluster: $GKE_CLUSTER
perform-login: true
- *delete_gke_jobs
+
workflow_filters: &workflow_filters
filters:
branches:
@@ -303,6 +394,9 @@ workflows:
- run_tests_torch_and_tf
- run_tests_torch
- run_tests_tf
+ - run_tests_flax
+ - run_tests_pipelines_torch
+ - run_tests_pipelines_tf
- build_doc
- deploy_doc: *workflow_filters
tpu_testing_jobs:
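Note on the new `-m is_pt_tf_cross_test` / `RUN_PT_TF_CROSS_TESTS=1` and `-m is_pipeline_test` / `RUN_PIPELINE_TESTS=1` invocations above: environment-gated pytest markers like these are usually wired up in a `conftest.py`. The snippet below is only a minimal sketch of that pattern, assuming the marker and variable names from the CI command; it is not the actual transformers test plumbing.

```python
# conftest.py -- minimal sketch of an environment-gated pytest marker (assumed wiring, not the real code).
import os

import pytest


def pytest_configure(config):
    # Register the marker so `-m is_pt_tf_cross_test` selects these tests without warnings.
    config.addinivalue_line("markers", "is_pt_tf_cross_test: PyTorch/TensorFlow cross tests")


def pytest_collection_modifyitems(config, items):
    # Unless RUN_PT_TF_CROSS_TESTS=1 is set (as in the CI job above), skip the marked tests.
    if os.environ.get("RUN_PT_TF_CROSS_TESTS", "0") == "1":
        return
    skip = pytest.mark.skip(reason="set RUN_PT_TF_CROSS_TESTS=1 to run PT/TF cross tests")
    for item in items:
        if "is_pt_tf_cross_test" in item.keywords:
            item.add_marker(skip)
```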
diff --git a/.circleci/deploy.sh b/.circleci/deploy.sh
index 79d957a1410971..fc17e1b17e0c84 100755
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -47,4 +47,9 @@ deploy_doc "e7cfc1a" v2.9.0
deploy_doc "7cb203f" v2.9.1
deploy_doc "10d7239" v2.10.0
deploy_doc "b42586e" v2.11.0
-deploy_doc "7fb8bdf" #v3.0.2 Latest stable release
\ No newline at end of file
+deploy_doc "7fb8bdf" v3.0.2
+deploy_doc "4b3ee9c" v3.1.0
+deploy_doc "3ebb1b3" v3.2.0
+deploy_doc "0613f05" v3.3.1
+deploy_doc "eb0e0ce" v3.4.0
+deploy_doc "818878d" # v3.5.1 Latest stable release
diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index 7b156536dffd61..05da6062fb11d1 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -30,20 +30,22 @@ assignees: ''
Trainer: @sgugger
Speed and Memory Benchmarks: @patrickvonplaten
Model Cards: @julien-c
- Translation: @sshleifer
- Summarization: @sshleifer
TextGeneration: @TevenLeScao
examples/distillation: @VictorSanh
nlp datasets: [different repo](https://github.com/huggingface/nlp)
rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
- Text Generation: @TevenLeScao
- blenderbot: @mariamabarham
- Bart: @sshleifer
- Marian: @sshleifer
+ Text Generation: @patrickvonplaten @TevenLeScao
+ Blenderbot: @patrickvonplaten
+ Bart: @patrickvonplaten
+ Marian: @patrickvonplaten
+ Pegasus: @patrickvonplaten
+ mBART: @patrickvonplaten
T5: @patrickvonplaten
Longformer/Reformer: @patrickvonplaten
- TransfoXL/XLNet: @TevenLeScao
- examples/seq2seq: @sshleifer
+ TransfoXL/XLNet: @TevenLeScao
+ RAG: @patrickvonplaten, @lhoestq
+ FSMT: @stas00
+ examples/seq2seq: @patil-suraj
examples/bert-loses-patience: @JetRunner
tensorflow: @jplu
examples/token-classification: @stefan-it
diff --git a/.github/ISSUE_TEMPLATE/question-help.md b/.github/ISSUE_TEMPLATE/question-help.md
index 5df4485488f974..87a1a53c1cee22 100644
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -1,6 +1,6 @@
---
name: "❓ Questions & Help"
-about: Post your general questions on the Hugging Face forum or Stack Overflow tagged huggingface-transformers
+about: Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/
title: ''
labels: ''
assignees: ''
@@ -10,18 +10,17 @@ assignees: ''
# ❓ Questions & Help
## Details
+
-
-**A link to original question on the forum/Stack Overflow**:
\ No newline at end of file
+
+
+**A link to original question on the forum**:
+
+
\ No newline at end of file
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 0a2a2f1614bf2b..f7f6fdeca3324a 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,2 +1,62 @@
-
-Fixes #{issue number}
+# What does this PR do?
+
+
+
+
+
+Fixes # (issue)
+
+
+## Before submitting
+- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
+- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#start-contributing-pull-requests),
+ Pull Request section?
+- [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
+ to it if that's the case.
+- [ ] Did you make sure to update the documentation with your changes? Here are the
+ [documentation guidelines](https://github.com/huggingface/transformers/tree/master/docs), and
+ [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/master/docs#writing-source-documentation).
+- [ ] Did you write any new necessary tests?
+
+
+## Who can review?
+
+Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
+members/contributors which may be interested in your PR.
+
+
diff --git a/.github/workflows/github-torch-hub.yml b/.github/workflows/github-torch-hub.yml
index cb8b21a88d46fb..93b9c777bfe4d3 100644
--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -8,6 +8,9 @@ on:
jobs:
torch_hub_integration:
runs-on: ubuntu-latest
+ env:
+ # TODO quickfix but may need more investigation
+ ACTIONS_ALLOW_UNSECURE_COMMANDS: True
steps:
# no checkout necessary here.
- name: Extract branch name
@@ -30,7 +33,7 @@ jobs:
run: |
pip install --upgrade pip
pip install torch
- pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging
+ pip install numpy filelock protobuf requests tqdm regex sentencepiece sacremoses tokenizers packaging
- name: Torch hub list
run: |
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index c855137f35ba76..0957f2f865cc75 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -1,64 +1,273 @@
name: Self-hosted runner (push)
-on:
+on:
push:
branches:
- master
- paths:
+ - model-templates
+ paths:
- "src/**"
- "tests/**"
- ".github/**"
+ - "templates/**"
# pull_request:
repository_dispatch:
jobs:
- run_tests_torch_and_tf_gpu:
- runs-on: self-hosted
+ run_tests_torch_gpu:
+ runs-on: [self-hosted, gpu, single-gpu]
steps:
- - uses: actions/checkout@v2
- - name: Python version
- run: |
- which python
- python --version
- pip --version
- - name: Current dir
- run: pwd
- - run: nvidia-smi
-
- - name: Loading cache.
- uses: actions/cache@v2
- id: cache
- with:
- path: .env
- key: v0-tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
-
- - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
- run: |
- python -m venv .env
- source .env/bin/activate
- which python
- python --version
- pip --version
- - name: Install dependencies
- run: |
- source .env/bin/activate
- pip install --upgrade pip
- pip install torch!=1.6.0
- pip install .[sklearn,testing,onnxruntime]
- pip install git+https://github.com/huggingface/nlp
-
- - name: Are GPUs recognized by our DL frameworks
- run: |
- source .env/bin/activate
- python -c "import torch; print(torch.cuda.is_available())"
-
- - name: Run all non-slow tests on GPU
- env:
- TF_FORCE_GPU_ALLOW_GROWTH: "true"
- # TF_GPU_MEMORY_LIMIT: 4096
- OMP_NUM_THREADS: 1
- USE_CUDA: yes
- run: |
- source .env/bin/activate
- python -m pytest -n 2 --dist=loadfile -s ./tests/
+ - uses: actions/checkout@v2
+ - name: Python version
+ run: |
+ which python
+ python --version
+ pip --version
+
+ - name: Current dir
+ run: pwd
+ - run: nvidia-smi
+
+ - name: Loading cache.
+ uses: actions/cache@v2
+ id: cache
+ with:
+ path: .env
+ key: v1.1-tests_torch_gpu-${{ hashFiles('setup.py') }}
+
+ - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+ run: |
+ python -m venv .env
+ source .env/bin/activate
+ which python
+ python --version
+ pip --version
+
+ - name: Install dependencies
+ run: |
+ source .env/bin/activate
+ pip install --upgrade pip
+ pip install .[torch,sklearn,testing,onnxruntime]
+ pip install git+https://github.com/huggingface/datasets
+
+ - name: Are GPUs recognized by our DL frameworks
+ run: |
+ source .env/bin/activate
+ python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+ python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+ - name: Create model files
+ run: |
+ source .env/bin/activate
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
+ - name: Run all non-slow tests on GPU
+ env:
+ OMP_NUM_THREADS: 1
+ CUDA_VISIBLE_DEVICES: 0
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_torch_gpu_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: run_all_tests_torch_gpu_test_reports
+ path: reports
+
+
+ run_tests_tf_gpu:
+ runs-on: [self-hosted, gpu, single-gpu]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Python version
+ run: |
+ which python
+ python --version
+ pip --version
+ - name: Current dir
+ run: pwd
+ - run: nvidia-smi
+
+ - name: Loading cache.
+ uses: actions/cache@v2
+ id: cache
+ with:
+ path: .env
+ key: v1.1-tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+ - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+ run: |
+ python -m venv .env
+ source .env/bin/activate
+ which python
+ python --version
+ pip --version
+
+ - name: Install dependencies
+ run: |
+ source .env/bin/activate
+ pip install --upgrade pip
+ pip install .[tf,sklearn,testing,onnxruntime]
+ pip install git+https://github.com/huggingface/datasets
+
+ - name: Are GPUs recognized by our DL frameworks
+ run: |
+ source .env/bin/activate
+ TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+ TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+ - name: Create model files
+ run: |
+ source .env/bin/activate
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+ transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
+ - name: Run all non-slow tests on GPU
+ env:
+ OMP_NUM_THREADS: 1
+ CUDA_VISIBLE_DEVICES: 0
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_tf_gpu_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: run_all_tests_tf_gpu_test_reports
+ path: reports
+
+ run_tests_torch_multi_gpu:
+ runs-on: [self-hosted, gpu, multi-gpu]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Python version
+ run: |
+ which python
+ python --version
+ pip --version
+
+ - name: Current dir
+ run: pwd
+ - run: nvidia-smi
+
+ - name: Loading cache.
+ uses: actions/cache@v2
+ id: cache
+ with:
+ path: .env
+ key: v1.1-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+ - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+ run: |
+ python -m venv .env
+ source .env/bin/activate
+ which python
+ python --version
+ pip --version
+ - name: Install dependencies
+ run: |
+ source .env/bin/activate
+ pip install --upgrade pip
+ pip install .[torch,sklearn,testing,onnxruntime]
+ pip install git+https://github.com/huggingface/datasets
+
+ - name: Are GPUs recognized by our DL frameworks
+ run: |
+ source .env/bin/activate
+ python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+ python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+ - name: Run all non-slow tests on GPU
+ env:
+ OMP_NUM_THREADS: 1
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_torch_multi_gpu_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: run_all_tests_torch_multi_gpu_test_reports
+ path: reports
+
+ run_tests_tf_multi_gpu:
+ runs-on: [self-hosted, gpu, multi-gpu]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Python version
+ run: |
+ which python
+ python --version
+ pip --version
+
+ - name: Current dir
+ run: pwd
+ - run: nvidia-smi
+
+ - name: Loading cache.
+ uses: actions/cache@v2
+ id: cache
+ with:
+ path: .env
+ key: v1.1-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+ - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+ run: |
+ python -m venv .env
+ source .env/bin/activate
+ which python
+ python --version
+ pip --version
+ - name: Install dependencies
+ run: |
+ source .env/bin/activate
+ pip install --upgrade pip
+ pip install .[tf,sklearn,testing,onnxruntime]
+ pip install git+https://github.com/huggingface/datasets
+
+ - name: Are GPUs recognized by our DL frameworks
+ run: |
+ source .env/bin/activate
+ TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+ TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+ - name: Run all non-slow tests on GPU
+ env:
+ OMP_NUM_THREADS: 1
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_tf_multi_gpu_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: run_all_tests_tf_multi_gpu_test_reports
+ path: reports
+
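The `--make-reports=<id>` flag and the `reports/<id>_failures_short.txt` files read in the "Failure short reports" steps come from a custom pytest plugin in the repository. As a rough sketch only (hook usage and file names below are assumptions based on how CI consumes the reports, not the actual implementation), such an option can be registered like this:

```python
# conftest.py -- rough sketch of a custom `--make-reports` option (assumed, not the real plugin).
import os


def pytest_addoption(parser):
    parser.addoption("--make-reports", default=None, help="write per-suite report files under reports/")


def pytest_terminal_summary(terminalreporter, exitstatus, config):
    report_id = config.getoption("--make-reports")
    if not report_id:
        return
    os.makedirs("reports", exist_ok=True)
    # One line per failed test, so CI can `cat reports/<id>_failures_short.txt` on failure.
    with open(f"reports/{report_id}_failures_short.txt", "w") as f:
        for report in terminalreporter.stats.get("failed", []):
            f.write(f"{report.nodeid}\n")
```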
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 243ade6afe8730..592733b5ba607d 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -1,72 +1,360 @@
+# configuration notes:
+#
+# - `source .env/bin/activate` currently needs to be run first thing in each step. Otherwise
+# the step uses the system-wide python interpreter.
+
name: Self-hosted runner (scheduled)
on:
push:
branches:
- ci_*
+ - framework-agnostic-tokenizers
repository_dispatch:
schedule:
- cron: "0 0 * * *"
jobs:
- run_all_tests_torch_and_tf_gpu:
- runs-on: self-hosted
+ run_all_tests_torch_gpu:
+ runs-on: [self-hosted, gpu, single-gpu]
+ steps:
+ - uses: actions/checkout@v2
+
+ - name: Loading cache.
+ uses: actions/cache@v2
+ id: cache
+ with:
+ path: .env
+        key: v1.1-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
+
+ - name: Python version
+ run: |
+ which python
+ python --version
+ pip --version
+
+ - name: Current dir
+ run: pwd
+ - run: nvidia-smi
+
+ - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+ if: steps.cache.outputs.cache-hit != 'true'
+ run: |
+ python -m venv .env
+ source .env/bin/activate
+ which python
+ python --version
+ pip --version
+
+ - name: Install dependencies
+ run: |
+ source .env/bin/activate
+ pip install --upgrade pip
+ pip install .[torch,sklearn,testing,onnxruntime]
+ pip install git+https://github.com/huggingface/datasets
+ pip list
+
+ - name: Are GPUs recognized by our DL frameworks
+ run: |
+ source .env/bin/activate
+ python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+ python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+ - name: Run all tests on GPU
+ env:
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_torch_gpu_failures_short.txt
+
+ - name: Run examples tests on GPU
+ if: ${{ always() }}
+ env:
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ run: |
+ source .env/bin/activate
+ pip install -r examples/requirements.txt
+ python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/examples_torch_gpu_failures_short.txt
+
+ - name: Run all pipeline tests on GPU
+ if: ${{ always() }}
+ env:
+ TF_FORCE_GPU_ALLOW_GROWTH: "true"
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ RUN_PIPELINE_TESTS: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: run_all_tests_torch_gpu_test_reports
+ path: reports
+
+
+ run_all_tests_tf_gpu:
+ runs-on: [self-hosted, gpu, single-gpu]
+ steps:
+ - uses: actions/checkout@v2
+
+ - name: Loading cache.
+ uses: actions/cache@v2
+ id: cache
+ with:
+ path: .env
+ key: v1.1-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+ - name: Python version
+ run: |
+ which python
+ python --version
+ pip --version
+
+ - name: Current dir
+ run: pwd
+ - run: nvidia-smi
+
+ - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+ if: steps.cache.outputs.cache-hit != 'true'
+ run: |
+ python -m venv .env
+ source .env/bin/activate
+ which python
+ python --version
+ pip --version
+
+ - name: Install dependencies
+ run: |
+ source .env/bin/activate
+ pip install --upgrade pip
+ pip install .[tf,sklearn,testing,onnxruntime]
+ pip install git+https://github.com/huggingface/datasets
+ pip list
+
+ - name: Are GPUs recognized by our DL frameworks
+ run: |
+ source .env/bin/activate
+ TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+ TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+ - name: Run all tests on GPU
+ env:
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_tf_gpu_failures_short.txt
+
+ - name: Run all pipeline tests on GPU
+ if: ${{ always() }}
+ env:
+ TF_FORCE_GPU_ALLOW_GROWTH: "true"
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ RUN_PIPELINE_TESTS: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: run_all_tests_tf_gpu_test_reports
+ path: reports
+
+ run_all_tests_torch_multi_gpu:
+ runs-on: [self-hosted, gpu, multi-gpu]
+ steps:
+ - uses: actions/checkout@v2
+
+ - name: Loading cache.
+ uses: actions/cache@v2
+ id: cache
+ with:
+ path: .env
+ key: v1.1-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+ - name: Python version
+ run: |
+ which python
+ python --version
+ pip --version
+
+ - name: Current dir
+ run: pwd
+ - run: nvidia-smi
+
+ - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+ if: steps.cache.outputs.cache-hit != 'true'
+ run: |
+ python -m venv .env
+ source .env/bin/activate
+ which python
+ python --version
+ pip --version
+
+ - name: Install dependencies
+ run: |
+ source .env/bin/activate
+ pip install --upgrade pip
+ pip install .[torch,sklearn,testing,onnxruntime]
+ pip install git+https://github.com/huggingface/datasets
+ pip list
+
+ - name: Are GPUs recognized by our DL frameworks
+ run: |
+ source .env/bin/activate
+ python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+ python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+ - name: Run all tests on multi-GPU
+ env:
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_torch_multi_gpu_failures_short.txt
+
+ - name: Run examples tests on multi-GPU
+ env:
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multi_gpu examples
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/examples_torch_multi_gpu_failures_short.txt
+
+ - name: Run all pipeline tests on multi-GPU
+ if: ${{ always() }}
+ env:
+ TF_FORCE_GPU_ALLOW_GROWTH: "true"
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ RUN_PIPELINE_TESTS: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: run_all_tests_torch_multi_gpu_test_reports
+ path: reports
+
+ run_all_tests_tf_multi_gpu:
+ runs-on: [self-hosted, gpu, multi-gpu]
steps:
- - uses: actions/checkout@v2
-
- - name: Loading cache.
- uses: actions/cache@v2
- id: cache
- with:
- path: .env
- key: v0-slow_tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
-
- - name: Python version
- run: |
- which python
- python --version
- pip --version
- - name: Current dir
- run: pwd
- - run: nvidia-smi
- - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
- if: steps.cache.outputs.cache-hit != 'true'
- run: |
- python -m venv .env
- source .env/bin/activate
- which python
- python --version
- pip --version
- - name: Install dependencies
- run: |
- source .env/bin/activate
- pip install --upgrade pip
- pip install torch!=1.6.0
- pip install .[sklearn,testing,onnxruntime]
- pip install git+https://github.com/huggingface/nlp
-
- - name: Are GPUs recognized by our DL frameworks
- run: |
- source .env/bin/activate
- python -c "import torch; print(torch.cuda.is_available())"
-
- - name: Run all tests on GPU
- env:
- TF_FORCE_GPU_ALLOW_GROWTH: "true"
- OMP_NUM_THREADS: 1
- RUN_SLOW: yes
- USE_CUDA: yes
- run: |
- source .env/bin/activate
- python -m pytest -n 1 --dist=loadfile -s ./tests/
-
- - name: Run examples tests on GPU
- env:
- TF_FORCE_GPU_ALLOW_GROWTH: "true"
- OMP_NUM_THREADS: 1
- RUN_SLOW: yes
- USE_CUDA: yes
- run: |
- source .env/bin/activate
- pip install -r examples/requirements.txt
- python -m pytest -n 1 --dist=loadfile -s examples
+ - uses: actions/checkout@v2
+
+ - name: Loading cache.
+ uses: actions/cache@v2
+ id: cache
+ with:
+ path: .env
+ key: v1.1-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+ - name: Python version
+ run: |
+ which python
+ python --version
+ pip --version
+
+ - name: Current dir
+ run: pwd
+ - run: nvidia-smi
+
+ - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+ if: steps.cache.outputs.cache-hit != 'true'
+ run: |
+ python -m venv .env
+ source .env/bin/activate
+ which python
+ python --version
+ pip --version
+
+ - name: Install dependencies
+ run: |
+ source .env/bin/activate
+ pip install --upgrade pip
+ pip install .[tf,sklearn,testing,onnxruntime]
+ pip install git+https://github.com/huggingface/datasets
+ pip list
+
+ - name: Are GPUs recognized by our DL frameworks
+ run: |
+ source .env/bin/activate
+ TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+ TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+ - name: Run all tests on multi-GPU
+ env:
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+ run: cat reports/tests_tf_multi_gpu_failures_short.txt
+
+ - name: Run all pipeline tests on multi-GPU
+ if: ${{ always() }}
+ env:
+ TF_FORCE_GPU_ALLOW_GROWTH: "true"
+ OMP_NUM_THREADS: 1
+ RUN_SLOW: yes
+ RUN_PIPELINE_TESTS: yes
+ run: |
+ source .env/bin/activate
+ python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_multi_gpu tests
+
+ - name: Failure short reports
+ if: ${{ always() }}
+      run: cat reports/tests_tf_pipelines_multi_gpu_failures_short.txt
+
+ - name: Test suite reports artifacts
+ if: ${{ always() }}
+ uses: actions/upload-artifact@v2
+ with:
+ name: run_all_tests_tf_multi_gpu_test_reports
+ path: reports
+
diff --git a/.gitignore b/.gitignore
index 7da929be8c2f8f..4137137f2853c7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,8 +9,11 @@ __pycache__/
*.so
# tests and logs
-tests/fixtures
+tests/fixtures/*
+!tests/fixtures/sample_text_no_unicode.txt
logs/
+lightning_logs/
+lang_code_data/
# Distribution / packaging
.Python
@@ -130,7 +133,6 @@ dmypy.json
tensorflow_code
# Models
-models
proc_data
# examples
@@ -139,6 +141,7 @@ runs
/wandb
/examples/runs
/examples/**/*.args
+/examples/rag/sweep
# data
/data
@@ -153,3 +156,6 @@ debug.env
#ctags
tags
+
+# pre-commit
+.pre-commit*
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 00000000000000..c8ad966288a9fa
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,129 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+ and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+ overall community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or
+ advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+ address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+feedback@huggingface.co.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 75615278282e29..8f18d2e2ba7067 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -9,6 +9,9 @@ It also helps us if you spread the word: reference the library from blog posts
on the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply star the repo to say "thank you".
+Whichever way you choose to contribute, please be mindful to respect our
+[code of conduct](https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md).
+
## You can contribute in so many ways!
There are 4 ways you can contribute to transformers:
@@ -93,7 +96,7 @@ folder.
## Start contributing! (Pull Requests)
-Before writing code, we strongly advise you to search through the exising PRs or
+Before writing code, we strongly advise you to search through the existing PRs or
issues to make sure that nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.
@@ -134,6 +137,18 @@ Follow these steps to start contributing:
it with `pip uninstall transformers` before reinstalling it in editable
mode with the `-e` flag.)
+ To run the full test suite, you might need the additional dependency on `datasets` which requires a separate source
+ install:
+
+ ```bash
+ $ git clone https://github.com/huggingface/datasets
+ $ cd datasets
+ $ pip install -e .
+ ```
+
+ If you have already cloned that repo, you might need to `git pull` to get the most recent changes in the `datasets`
+ library.
+
5. Develop the features on your branch.
As you work on the features, you should make sure that the test suite
@@ -158,12 +173,19 @@ Follow these steps to start contributing:
$ make style
```
- `transformers` also uses `flake8` to check for coding mistakes. Quality
+ `transformers` also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
control runs in CI, however you can also run the same checks with:
```bash
$ make quality
```
+ You can do the automatic style corrections and code verifications that can't be automated in one go:
+
+ ```bash
+ $ make fixup
+ ```
+
+ This target is also optimized to only work with files modified by the PR you're working on.
If you're modifying documents under `docs/source`, make sure to validate that
they can still be built. This check also runs in CI. To run a local check
@@ -213,7 +235,7 @@ Follow these steps to start contributing:
### Checklist
1. The title of your pull request should be a summary of its contribution;
-2. If your pull request adresses an issue, please mention the issue number in
+2. If your pull request addresses an issue, please mention the issue number in
the pull request description to make sure they are linked (and people
consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`. These
@@ -286,3 +308,12 @@ Check our [documentation writing guide](https://github.com/huggingface/transform
for more information.
#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)
+
+
+### Develop on Windows
+
+One way to run the `make` command on Windows is through MSYS2:
+
+1. [Download MSYS2](https://www.msys2.org/); we assume it is installed in `C:\msys64`.
+2. Open the command line `C:\msys64\msys2.exe` (it should be available from the start menu).
+3. Run `pacman -Syu` in the shell, then install `make` with `pacman -S make`.
diff --git a/Makefile b/Makefile
index 62215da63e4919..4ef8b924ef0f8a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,18 +1,51 @@
-.PHONY: quality style test test-examples docs
+.PHONY: modified_only_fixup extra_quality_checks quality style fixup fix-copies test test-examples docs
+
+
+check_dirs := examples tests src utils
+
+modified_only_fixup:
+ $(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
+ @if test -n "$(modified_py_files)"; then \
+ echo "Checking/fixing $(modified_py_files)"; \
+ black $(modified_py_files); \
+ isort $(modified_py_files); \
+ flake8 $(modified_py_files); \
+ else \
+ echo "No library .py files were modified"; \
+ fi
# Check that source code meets quality standards
-quality:
- black --check --line-length 119 --target-version py35 examples templates tests src utils
- isort --check-only examples templates tests src utils
- flake8 examples templates tests src utils
+extra_quality_checks:
+ python utils/check_copies.py
+ python utils/check_dummies.py
python utils/check_repo.py
+ python utils/style_doc.py src/transformers docs/source --max_len 119
+
+# this target runs checks on all files
+quality:
+ black --check $(check_dirs)
+ isort --check-only $(check_dirs)
+ flake8 $(check_dirs)
+ python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
+ ${MAKE} extra_quality_checks
-# Format source code automatically
+# Format source code automatically and check if there are any problems left that need manual fixing
style:
- black --line-length 119 --target-version py35 examples templates tests src utils
- isort examples templates tests src utils
+ black $(check_dirs)
+ isort $(check_dirs)
+ python utils/style_doc.py src/transformers docs/source --max_len 119
+
+# Super fast fix and check target that only works on relevant modified files since the branch was made
+
+fixup: modified_only_fixup extra_quality_checks
+
+# Make marked copies of snippets of code conform to the original
+
+fix-copies:
+ python utils/check_copies.py --fix_and_overwrite
+ python utils/check_dummies.py --fix_and_overwrite
# Run tests for the library
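The `modified_only_fixup` target above relies on `utils/get_modified_files.py` to list the `.py` files touched since the branch diverged from `master`. A rough sketch of what such a helper can look like follows; this is an illustration inferred from the Makefile usage, not necessarily the script shipped in the repo.

```python
# utils/get_modified_files.py -- sketch: print the .py files, under the given directories,
# that were modified since the current branch forked from master (illustrative, assumed).
import re
import subprocess
import sys

# Find the commit where the current branch diverged from master.
fork_point = subprocess.check_output("git merge-base master HEAD".split()).decode("utf-8").strip()
# List files changed since then, ignoring deleted ones (black/isort/flake8 cannot run on those).
modified_files = (
    subprocess.check_output(f"git diff --diff-filter=d --name-only {fork_point}".split())
    .decode("utf-8")
    .split()
)

# Keep only .py files under the directories passed on the command line (e.g. `examples tests src utils`).
joined_dirs = "|".join(sys.argv[1:])
regex = re.compile(rf"^({joined_dirs})/.*\.py$")

print(" ".join(f for f in modified_files if regex.match(f)))
```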
diff --git a/README.md b/README.md
index 9d822511384c58..ac2e588de43e93 100644
--- a/README.md
+++ b/README.md
@@ -16,551 +16,62 @@
+
+
+
State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, T5, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over thousands of pretrained models in 100+ languages and deep interoperability between PyTorch & TensorFlow 2.0.
-
-### Recent contributors
-[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)
-
-### Features
-- High performance on NLU and NLG tasks
-- Low barrier to entry for educators and practitioners
-
-State-of-the-art NLP for everyone
-- Deep learning researchers
-- Hands-on practitioners
-- AI/ML/NLP teachers and educators
-
-Lower compute costs, smaller carbon footprint
-- Researchers can share trained models instead of always retraining
-- Practitioners can reduce compute time and production costs
-- Dozens of architectures with over 1,000 pretrained models, some in more than 100 languages
-
-Choose the right framework for every part of a model's lifetime
-- Train state-of-the-art models in 3 lines of code
-- Deep interoperability between TensorFlow 2.0 and PyTorch models
-- Move a single model between TF2.0/PyTorch frameworks at will
-- Seamlessly pick the right framework for training, evaluation, production
-
-
-| Section | Description |
-|-|-|
-| [Installation](#installation) | How to install the package |
-| [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
-| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
-| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
-| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
-| [Quick tour: pipelines](#quick-tour-of-pipelines) | Using Pipelines: Wrapper around tokenizer and models to use finetuned models |
-| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
-| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
-| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
-| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
-
-## Installation
-
-This repo is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for examples) and TensorFlow 2.0.
-
-You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
-
-Create a virtual environment with the version of Python you're going to use and activate it.
-
-Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you must install it from source.
-
-### With pip
-
-First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
-
-When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
-
-```bash
-pip install transformers
-```
-
-### From source
-
-Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
-
-When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
-
-```bash
-git clone https://github.com/huggingface/transformers
-cd transformers
-pip install .
-```
-
-When you update the repository, you should upgrade the transformers installation and its dependencies as follows:
-
-```bash
-git pull
-pip install --upgrade .
-```
-
-### Run the examples
-
-Examples are included in the repository but are not shipped with the library.
-
-Therefore, in order to run the latest versions of the examples, you need to install from source, as described above.
-
-Look at the [README](https://github.com/huggingface/transformers/blob/master/examples/README.md) for how to run examples.
-
-### Tests
+🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc., in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone.
-A series of tests are included for the library and for some example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
+🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets, and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each Python module defining an architecture can be used as a standalone and modified to enable quick research experiments.
-Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
+🤗 Transformers is backed by the two most popular deep learning libraries, [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with a seamless integration between them, allowing you to train your models with one and then load them for inference with the other.
-Here's the easiest way to run tests for the library:
-
-```bash
-pip install -e ".[testing]"
-make test
-```
-
-and for the examples:
-
-```bash
-pip install -e ".[testing]"
-pip install -r examples/requirements.txt
-make test-examples
-```
-
-For details, refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests).
-
-### Do you want to run a Transformer model on a mobile device?
-
-You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
-
-It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
-
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
-
-## Model architectures
-
-🤗 Transformers currently provides the following NLU/NLG architectures:
-
-1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-7. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
-9. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-13. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper a [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
-15. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-16. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-17. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-18. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-19. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-20. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-21. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-22. **[DPR](https://github.com/facebookresearch/DPR)** (from Facebook) released with the paper [Dense Passage Retrieval
-for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
-Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-23. **[Pegasus](https://github.com/google-research/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-24. **[MBart](https://github.com/pytorch/fairseq/tree/master/examples/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-25. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
-26. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
-
-These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
+### Recent contributors
+[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)
-## Online demo
+## Online demos
-You can test our inference API on most model pages from the model hub: https://huggingface.co/models
+You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer an [inference API](https://huggingface.co/pricing) to use those models.
-For example:
+Here are a few examples:
- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
-- [NER with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
-- [NLI with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
- [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
-
-**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
+**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team, is the official demo of this repo’s text generation capabilities.
## Quick tour
-Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
+To immediately use a model on a given text, we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:
```python
-import torch
-from transformers import *
-
-# Transformers has a unified API
-# for 10 transformer architectures and 30 pretrained weights.
-# Model | Tokenizer | Pretrained weights shortcut
-MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'),
- (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'),
- (GPT2Model, GPT2Tokenizer, 'gpt2'),
- (CTRLModel, CTRLTokenizer, 'ctrl'),
- (TransfoXLModel, TransfoXLTokenizer, 'transfo-xl-wt103'),
- (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'),
- (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'),
- (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
- (RobertaModel, RobertaTokenizer, 'roberta-base'),
- (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
- ]
-
-# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
-
-# Let's encode some text in a sequence of hidden-states using each model:
-for model_class, tokenizer_class, pretrained_weights in MODELS:
- # Load pretrained model/tokenizer
- tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
- model = model_class.from_pretrained(pretrained_weights)
-
- # Encode text
- input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)]) # Add special tokens takes care of adding [CLS], [SEP], ... tokens in the right way for each model.
- with torch.no_grad():
- last_hidden_states = model(input_ids)[0] # Models outputs are now tuples
-
-# Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
-BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
- BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]
-
-# All the classes for an architecture can be initiated from pretrained weights for this architecture
-# Note that additional weights added for fine-tuning are only initialized
-# and need to be trained on the down-stream task
-pretrained_weights = 'bert-base-uncased'
-tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
-for model_class in BERT_MODEL_CLASSES:
- # Load pretrained model/tokenizer
- model = model_class.from_pretrained(pretrained_weights)
-
- # Models can return full list of hidden-states & attentions weights at each layer
- model = model_class.from_pretrained(pretrained_weights,
- output_hidden_states=True,
- output_attentions=True)
- input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
- all_hidden_states, all_attentions = model(input_ids)[-2:]
-
- # Models are compatible with Torchscript
- model = model_class.from_pretrained(pretrained_weights, torchscript=True)
- traced_model = torch.jit.trace(model, (input_ids,))
-
- # Simple serialization for models and tokenizers
- model.save_pretrained('./directory/to/save/') # save
- model = model_class.from_pretrained('./directory/to/save/') # re-load
- tokenizer.save_pretrained('./directory/to/save/') # save
- tokenizer = BertTokenizer.from_pretrained('./directory/to/save/') # re-load
-
- # SOTA examples for GLUE, SQUAD, text generation...
-```
-
-## Quick tour TF 2.0 training and PyTorch interoperability
-
-Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
-
-```python
-import tensorflow as tf
-import tensorflow_datasets
-from transformers import *
-
-# Load dataset, tokenizer, model from pretrained model/vocabulary
-tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
-data = tensorflow_datasets.load('glue/mrpc')
-
-# Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
-train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
-valid_dataset = valid_dataset.batch(64)
-
-# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
-loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
-model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
-
-# Train and evaluate using tf.keras.Model.fit()
-history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
- validation_data=valid_dataset, validation_steps=7)
-
-# Load the TensorFlow model in PyTorch for inspection
-model.save_pretrained('./save/')
-pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
-
-# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = "This research was consistent with his findings."
-sentence_1 = "His findings were compatible with this research."
-sentence_2 = "His findings were not compatible with this research."
-inputs_1 = tokenizer(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-inputs_2 = tokenizer(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
-
-pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
-pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()
-
-print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
-print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
-```
-
-## Quick tour of the fine-tuning/usage scripts
-
-**Important**
-Before running the fine-tuning scripts, please read the
-[instructions](#run-the-examples) on how to
-setup your environment to run the examples.
-
-The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
-
-- `run_glue.py`: an example fine-tuning sequence classification models on nine different GLUE tasks (*sequence-level classification*)
-- `run_squad.py`: an example fine-tuning question answering models on the question answering dataset SQuAD 2.0 (*token-level classification*)
-- `run_ner.py`: an example fine-tuning token classification models on named entity recognition (*token-level classification*)
-- `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation
-- other model-specific examples (see the documentation).
-
-Here are three quick usage examples for these scripts:
-
-### `run_glue.py`: Fine-tuning on GLUE tasks for sequence classification
-
-The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.
-
-Before running any of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-You should also install the additional packages required by the examples:
-
-```shell
-pip install -r ./examples/requirements.txt
-```
-
-```shell
-export GLUE_DIR=/path/to/glue
-export TASK_NAME=MRPC
-
-python ./examples/text-classification/run_glue.py \
- --model_name_or_path bert-base-uncased \
- --task_name $TASK_NAME \
- --do_train \
- --do_eval \
- --data_dir $GLUE_DIR/$TASK_NAME \
- --max_seq_length 128 \
- --per_device_eval_batch_size=8 \
- --per_device_train_batch_size=8 \
- --learning_rate 2e-5 \
- --num_train_epochs 3.0 \
- --output_dir /tmp/$TASK_NAME/
-```
-
-where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
-
-#### Fine-tuning XLNet model on the STS-B regression task
-
-This example code fine-tunes XLNet on the STS-B corpus using parallel training on a server with 4 V100 GPUs.
-Parallel training is a simple way to use several GPUs (but is slower and less flexible than distributed training, see below).
-
-```shell
-export GLUE_DIR=/path/to/glue
-
-python ./examples/text-classification/run_glue.py \
- --model_name_or_path xlnet-large-cased \
- --do_train \
- --do_eval \
- --task_name=sts-b \
- --data_dir=${GLUE_DIR}/STS-B \
- --output_dir=./proc_data/sts-b-110 \
- --max_seq_length=128 \
- --per_device_eval_batch_size=8 \
- --per_device_train_batch_size=8 \
- --gradient_accumulation_steps=1 \
- --max_steps=1200 \
- --model_name=xlnet-large-cased \
- --overwrite_output_dir \
- --overwrite_cache \
- --warmup_steps=120
-```
-
-On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should result in a Pearson correlation coefficient of `+0.917` on the development set.
-
-#### Fine-tuning Bert model on the MRPC classification task
-
-This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) corpus using distributed training on 8 V100 GPUs to reach a F1 > 92.
-
-```bash
-python -m torch.distributed.launch --nproc_per_node 8 ./examples/text-classification/run_glue.py \
- --model_name_or_path bert-large-uncased-whole-word-masking \
- --task_name MRPC \
- --do_train \
- --do_eval \
- --data_dir $GLUE_DIR/MRPC/ \
- --max_seq_length 128 \
- --per_device_eval_batch_size=8 \
- --per_device_train_batch_size=8 \
- --learning_rate 2e-5 \
- --num_train_epochs 3.0 \
- --output_dir /tmp/mrpc_output/ \
- --overwrite_output_dir \
- --overwrite_cache \
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
- acc = 0.8823529411764706
- acc_and_f1 = 0.901702786377709
- eval_loss = 0.3418912578906332
- f1 = 0.9210526315789473
- global_step = 174
- loss = 0.07231863956341798
-```
-
-### `run_squad.py`: Fine-tuning on SQuAD for question-answering
-
-This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
- --model_type bert \
- --model_name_or_path bert-large-uncased-whole-word-masking \
- --do_train \
- --do_eval \
- --train_file $SQUAD_DIR/train-v1.1.json \
- --predict_file $SQUAD_DIR/dev-v1.1.json \
- --learning_rate 3e-5 \
- --num_train_epochs 2 \
- --max_seq_length 384 \
- --doc_stride 128 \
- --output_dir ../models/wwm_uncased_finetuned_squad/ \
- --per_device_eval_batch_size=3 \
- --per_device_train_batch_size=3 \
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
-{"exact_match": 86.91579943235573, "f1": 93.1532499015869}
-```
-
-This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`.
-
-### `run_generation.py`: Text generation with GPT, GPT-2, CTRL, Transformer-XL and XLNet
-
-A conditional generation script is also included to generate text from a prompt.
-The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
-
-Here is how to run the script with the small version of OpenAI GPT-2 model:
-
-```shell
-python ./examples/text-generation/run_generation.py \
- --model_type=gpt2 \
- --length=20 \
- --model_name_or_path=gpt2 \
-```
-
-and from the Salesforce CTRL model:
-```shell
-python ./examples/text-generation/run_generation.py \
- --model_type=ctrl \
- --length=20 \
- --model_name_or_path=ctrl \
- --temperature=0 \
- --repetition_penalty=1.2 \
-```
-
-## Quick tour of model sharing
-
-Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the CLI that's built-in to the library.
-
-**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Optionally, join an existing organization or create a new one. Then:
-
-```shell
-transformers-cli login
-# log in using the same credentials as on huggingface.co
-```
-Upload your model:
-```shell
-transformers-cli upload ./path/to/pretrained_model/
-
-# ^^ Upload folder containing weights/tokenizer/config
-# saved via `.save_pretrained()`
-
-transformers-cli upload ./config.json [--filename folder/foobar.json]
-
-# ^^ Upload a single file
-# (you can optionally override its filename, which can be nested inside a folder)
-```
-
-If you want your model to be namespaced by your organization name rather than your username, add the following flag to any command:
-```shell
---organization organization_name
-```
-
-Your model will then be accessible through its identifier, a concatenation of your username (or organization name) and the folder name above:
-```python
-"username/pretrained_model"
-# or if an org:
-"organization_name/pretrained_model"
-```
-
-**Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hardware used, hyperparameters), evaluation results, intended uses & limitations, etc.
-
-Your model now has a page on huggingface.co/models 🔥
-
-Anyone can load it from code:
-```python
-tokenizer = AutoTokenizer.from_pretrained("namespace/pretrained_model")
-model = AutoModel.from_pretrained("namespace/pretrained_model")
-```
-
-List all your files on S3:
-```shell
-transformers-cli s3 ls
-```
-
-You can also delete unneeded files:
+>>> from transformers import pipeline
-```shell
-transformers-cli s3 rm …
+# Allocate a pipeline for sentiment-analysis
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to include pipeline into the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9978193640708923}]
```
-## Quick tour of pipelines
-
-New in version `v2.3`: `Pipeline` are high-level objects which automatically handle tokenization, running your data through a transformers model
-and outputting the result in a structured object.
-
-You can create `Pipeline` objects for the following down-stream tasks:
+The second line of code downloads and caches the pretrained model used by the pipeline, while the third line evaluates it on the given text. Here the answer is "positive" with a confidence of 99.8%.
- - `feature-extraction`: Generates a tensor representation for the input sequence
- - `ner`: Generates named entity mapping for each word in the input sequence.
- - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence.
- - `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
- - `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question in the context.
- - `fill-mask`: Takes an input sequence containing a masked token (e.g. ``) and return list of most probable filled sequences, with their probabilities.
- - `summarization`
- - `translation_xx_to_yy`
+Here is another example, using a pipeline to extract the answer to a question from some given context:
-```python
+```python
>>> from transformers import pipeline
-# Allocate a pipeline for sentiment-analysis
->>> nlp = pipeline('sentiment-analysis')
->>> nlp('We are very happy to include pipeline into the transformers repository.')
-[{'label': 'POSITIVE', 'score': 0.9978193640708923}]
-
# Allocate a pipeline for question-answering
->>> nlp = pipeline('question-answering')
->>> nlp({
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
... 'question': 'What is the name of the repository ?',
... 'context': 'Pipeline have been included in the huggingface/transformers repository'
... })
@@ -568,133 +79,138 @@ You can create `Pipeline` objects for the following down-stream tasks:
```
-## Migrating from pytorch-transformers to transformers
-
-Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
-
-### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
+On top of the answer, the pretrained model used here returned its confidence score, along with the start and end positions of the answer in the tokenized sentence. You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/transformers/task_summary.html).
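+
+As a minimal sketch of how that returned dictionary can be consumed (the key names match the fields described above; the exact values depend on the model version), reusing the pipeline defined earlier:
+
+```python
+>>> result = question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline have been included in the huggingface/transformers repository'
+... })
+>>> print(f"answer: {result['answer']}, score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+```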
-To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models **keywords inputs** (`attention_mask`, `token_type_ids`...) has been changed.
-
-If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
-
-If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
-
-
-## Migrating from pytorch-pretrained-bert to transformers
+To download and use any of the pretrained models on a given task, you just need these three lines of code (PyTorch version):
+```python
+>>> from transformers import AutoTokenizer, AutoModel
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
-### Models always output `tuples`
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+or for TensorFlow:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
-The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that every model's forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
-The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
-In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
+The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single text (or a list of texts), as we can see in the fourth line of both code examples. It will output a dictionary that you can directly pass to your model (which is done in the fifth line).
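+
+For instance, with the BERT tokenizer above, that dictionary contains the `input_ids`, `token_type_ids` and `attention_mask` tensors the model expects (a sketch of the PyTorch case; the exact token ids shown are illustrative):
+
+```python
+>>> print(inputs)
+{'input_ids': tensor([[ 101, 7592, 2088,  999,  102]]),
+ 'token_type_ids': tensor([[0, 0, 0, 0, 0]]),
+ 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
+```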
-Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
+The model itself is a regular [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use normally. For instance, [this tutorial](https://huggingface.co/transformers/training.html) explains how to integrate such a model in a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.
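+
+As a minimal sketch of the `Trainer` API (here `train_dataset` is a placeholder for any dataset of already-encoded examples, and the hyper-parameters are arbitrary):
+
+```python
+from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
+
+model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
+
+training_args = TrainingArguments(
+    output_dir="./results",             # where checkpoints will be written
+    num_train_epochs=1,
+    per_device_train_batch_size=8,
+)
+
+# train_dataset is assumed to be a dataset of examples already encoded with the tokenizer
+trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
+trainer.train()
+```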
-```python
-# Let's load our model
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+## Why should I use transformers?
-# If you used to have this line in pytorch-pretrained-bert:
-loss = model(input_ids, labels=labels)
+1. Easy-to-use state-of-the-art models:
+ - High performance on NLU and NLG tasks.
+ - Low barrier to entry for educators and practitioners.
+ - Few user-facing abstractions with just three classes to learn.
+ - A unified API for using all our pretrained models.
-# Now just use this line in transformers to extract the loss from the output tuple:
-outputs = model(input_ids, labels=labels)
-loss = outputs[0]
+1. Lower compute costs, smaller carbon footprint:
+ - Researchers can share trained models instead of always retraining.
+ - Practitioners can reduce compute time and production costs.
+ - Dozens of architectures with over 2,000 pretrained models, some in more than 100 languages.
-# In transformers you can also have access to the logits:
-loss, logits = outputs[:2]
+1. Choose the right framework for every part of a model's lifetime:
+ - Train state-of-the-art models in 3 lines of code.
+ - Move a single model between TF2.0/PyTorch frameworks at will.
+ - Seamlessly pick the right framework for training, evaluation, production.
-# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
-outputs = model(input_ids, labels=labels)
-loss, logits, attentions = outputs
-```
+1. Easily customize a model or an example to your needs:
+ - Examples for each architecture to reproduce the results by the official authors of said architecture.
+ - Expose the models' internals as consistently as possible.
+ - Model files can be used independently of the library for quick experiments.
-### Using hidden states
+## Why shouldn't I use transformers?
-By enabling the configuration option `output_hidden_states`, it was possible to retrieve the last hidden states of the encoder. In `pytorch-transformers` as well as `transformers` the return value has changed slightly: `all_hidden_states` now also includes the hidden state of the embeddings in addition to those of the encoding layers. This allows users to easily access the embeddings final state.
+- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
+- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library.
+- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/master/examples) are just that: examples. It is expected that they won't work out of the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.
-### Serialization
+## Installation
-Breaking change in the `from_pretrained()` method:
+This repository is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for [examples](https://github.com/huggingface/transformers/tree/master/examples)) and TensorFlow 2.0.
-1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them, don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
+You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding the model's `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
+First, create a virtual environment with the version of Python you're going to use and activate it.
-Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
+Then, you will need to install one of, or both, TensorFlow 2.0 and PyTorch.
+Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) for the specific install command for your platform.
-Here is an example:
+When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
-```python
-### Let's load a model and tokenizer
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-### Do some stuff to our model and tokenizer
-# Ex: add new tokens to the vocabulary and embeddings of our model
-tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
-model.resize_token_embeddings(len(tokenizer))
-# Train our model
-train(model)
-
-### Now let's save our model and tokenizer to a directory
-model.save_pretrained('./my_saved_model_directory/')
-tokenizer.save_pretrained('./my_saved_model_directory/')
-
-### Reload the model and the tokenizer
-model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
-tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
+```bash
+pip install transformers
```
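+
+As a quick sanity check that the installation works (this snippet is only illustrative; the first run downloads a small default sentiment-analysis model):
+
+```python
+from transformers import pipeline
+
+# Should print a POSITIVE label with a high score if everything is installed correctly
+print(pipeline('sentiment-analysis')('We are very happy to use 🤗 Transformers.'))
+```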
-### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
-
-The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
-
-- it only implements weights decay correction,
-- schedules are now externals (see below),
-- gradient clipping is now also external (see below).
+If you'd like to play with the examples, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).
-The new optimizer `AdamW` matches PyTorch `Adam` optimizer API and let you use standard PyTorch or apex methods for the schedule and clipping.
+## Model architectures
-The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
+🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/transformers/model_summary.html) for a high-level summary of each of them):
-Here is a conversion examples from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
+1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft Research) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
+for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
+Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
+1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
+1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
+
+These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations. You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
+
+
+## Learn more
-```python
-# Parameters:
-lr = 1e-3
-max_grad_norm = 1.0
-num_training_steps = 1000
-num_warmup_steps = 100
-warmup_proportion = float(num_warmup_steps) / float(num_training_steps) # 0.1
-
-### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
-### and used like this:
-for batch in train_data:
- loss = model(batch)
- loss.backward()
- optimizer.step()
-
-### In Transformers, optimizer and schedules are splitted and instantiated like this:
-optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
-### and used like this:
-for batch in train_data:
- model.train()
- loss = model(batch)
- loss.backward()
- torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
- optimizer.step()
- scheduler.step()
- optimizer.zero_grad()
-```
+| Section | Description |
+|-|-|
+| [Documentation](https://huggingface.co/transformers/) | Full API documentation and tutorials |
+| [Task summary](https://huggingface.co/transformers/task_summary.html) | Tasks supported by 🤗 Transformers |
+| [Preprocessing tutorial](https://huggingface.co/transformers/preprocessing.html) | Using the `Tokenizer` class to prepare data for the models |
+| [Training and fine-tuning](https://huggingface.co/transformers/training.html) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API |
+| [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/master/examples) | Example scripts for fine-tuning models on a wide range of tasks |
+| [Model sharing and uploading](https://huggingface.co/transformers/model_sharing.html) | Upload and share your fine-tuned models with the community |
+| [Migration](https://huggingface.co/transformers/migration.html) | Migrate to 🤗 Transformers from `pytorch-transformers` or `pytorch-pretrained-bert` |
## Citation
diff --git a/codecov.yml b/codecov.yml
deleted file mode 100644
index ecacb3725f1a9b..00000000000000
--- a/codecov.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-coverage:
- status:
- project:
- default:
- informational: true
- patch: off
-comment:
- require_changes: true # only comment if there was change in coverage
- require_head: yes # don't report if there is no head coverage report
- require_base: yes # don't report if there is no base coverage report
diff --git a/docker/transformers-gpu/Dockerfile b/docker/transformers-gpu/Dockerfile
index 6d68d2e4809757..0212eaa2a72b26 100644
--- a/docker/transformers-gpu/Dockerfile
+++ b/docker/transformers-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
LABEL maintainer="Hugging Face"
LABEL repository="transformers"
@@ -18,9 +18,14 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
tensorflow \
torch
+RUN git clone https://github.com/NVIDIA/apex
+RUN cd apex && \
+ python3 setup.py install && \
+ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+
WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .
-CMD ["/bin/bash"]
\ No newline at end of file
+CMD ["/bin/bash"]
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile
index 4beff57dc9f694..5ed2bd70fd2faa 100644
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
LABEL maintainer="Hugging Face"
LABEL repository="transformers"
@@ -17,9 +17,14 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
mkl \
torch
+RUN git clone https://github.com/NVIDIA/apex
+RUN cd apex && \
+ python3 setup.py install && \
+ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+
WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .
-CMD ["/bin/bash"]
\ No newline at end of file
+CMD ["/bin/bash"]
diff --git a/docs/README.md b/docs/README.md
index 6da2f78f3abc7e..0c011ad1db7832 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -88,20 +88,25 @@ The `huggingface/transformers` documentation follows the
[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
mostly written in ReStructuredText
([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html),
-[Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html))
+[Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)).
-### Adding a new section
-A section is a page held in the `Notes` toc-tree on the documentation. Adding a new section is done in two steps:
+### Adding a new tutorial
+
+Adding a new tutorial or section is done in two steps:
- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
- Link that file in `./source/index.rst` on the correct toc-tree.
+Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
+depending on the intended targets (beginners, more advanced users or researchers) it should go in section two, three or
+four.
+
### Adding a new model
When adding a new model:
-- Create a file `xxx.rst` under `./source/model_doc`.
+- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as template).
- Link that file in `./source/index.rst` on the `model_doc` toc-tree.
- Write a short overview of the model:
- Overview with paper & authors
@@ -120,18 +125,18 @@ When adding a new model:
These classes should be added using the RST syntax. Usually as follows:
```
XXXConfig
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XXXConfig
:members:
```
-This will include every public method of the configuration. If for some reason you wish for a method not to be
-displayed in the documentation, you can do so by specifying which methods should be in the docs:
+This will include every public method of the configuration that is documented. If for some reason you wish for a method
+not to be displayed in the documentation, you can do so by specifying which methods should be in the docs:
```
XXXTokenizer
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XXXTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
@@ -142,13 +147,17 @@ XXXTokenizer
### Writing source documentation
Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as
-an object using the :obj: syntax: :obj:\`like so\`.
+an object using the :obj: syntax: :obj:\`like so\`. Note that argument names and objects like True, None or any strings
+should usually be put in `code`.
When mentioning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically
-linked by Sphinx: :class:\`transformers.XXXClass\`
+linked by Sphinx: :class:\`~transformers.XXXClass\`
-When mentioning a function, it is recommended to use the :func: syntax as the mentioned method will be automatically
-linked by Sphinx: :func:\`transformers.XXXClass.method\`
+When mentioning a function, it is recommended to use the :func: syntax as the mentioned function will be automatically
+linked by Sphinx: :func:\`~transformers.function\`.
+
+When mentioning a method, it is recommended to use the :meth: syntax as the mentioned method will be automatically
+linked by Sphinx: :meth:\`~transformers.XXXClass.method\`.
Links should be done as so (note the double underscore at the end): \`text for the link <./local-link-or-global-link#loc>\`__
@@ -165,13 +174,34 @@ Here's an example showcasing everything so far:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
- Indices can be obtained using :class:`transformers.AlbertTokenizer`.
- See :func:`transformers.PreTrainedTokenizer.encode` and
- :func:`transformers.PreTrainedTokenizer.__call__` for details.
+ Indices can be obtained using :class:`~transformers.AlbertTokenizer`.
+ See :meth:`~transformers.PreTrainedTokenizer.encode` and
+ :meth:`~transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
```
+For optional arguments or arguments with defaults, we use the following syntax. Imagine we have a function with the
+following signature:
+
+```
+def my_function(x: str = None, a: float = 1):
+```
+
+then its documentation should look like this:
+
+```
+ Args:
+ x (:obj:`str`, `optional`):
+ This argument controls ...
+ a (:obj:`float`, `optional`, defaults to 1):
+ This argument is used to ...
+```
+
+Note that we always omit the "defaults to :obj:\`None\`" when :obj:\`None\` is the default for an argument. Also note that
+even if the first line describing your argument type and its default gets long, you can't break it across several lines.
+You can however write as many lines as you want in the indented description (see the example above with `input_ids`).
+
#### Writing a multi-line code block
Multi-line code blocks can be useful for displaying examples. They are done like so:
@@ -186,6 +216,9 @@ Example::
The `Example` string at the beginning can be replaced by anything as long as there are two colons following it.
+We follow the [doctest](https://docs.python.org/3/library/doctest.html) syntax for the examples to automatically test
+that the results stay consistent with the library.
+
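
As a purely illustrative sketch of the doctest style described above (the output shown is indicative, not a verified run), an example in a docstring could look like:

```
Example::

    >>> from transformers import BertTokenizer
    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    >>> tokenizer.tokenize("Hello world!")
    ['hello', 'world', '!']
```
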
#### Writing a return block
Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
@@ -207,5 +240,5 @@ Here's an example for a single value return:
```
Returns:
- A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ :obj:`List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```
diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 081e99f1654b28..9b31a2df673c31 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -125,6 +125,12 @@ a.copybtn {
background-color: #6670FF;
}
+/* The section headers in the toc tree */
+.wy-menu-vertical p.caption{
+ background-color: #4d59ff;
+ line-height: 40px;
+}
+
/* The selected items in the toc tree */
.wy-menu-vertical li.current{
background-color: #A6B0FF;
diff --git a/docs/source/_static/js/custom.js b/docs/source/_static/js/custom.js
index f1266c4ce03373..867787d3e938d7 100644
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,10 +1,15 @@
// These two things need to be updated at each release for the version selector.
// Last stable version
-const stableVersion = "v3.0.2"
+const stableVersion = "v3.5.0"
// Dictionary doc folder to label
const versionMapping = {
"master": "master",
- "": "v3.0.0/v3.0.1/v3.0.2 (stable)",
+ "": "v3.5.0/v3.5.1",
+ "v3.4.0": "v3.4.0",
+ "v3.3.1": "v3.3.0/v3.3.1",
+ "v3.2.0": "v3.2.0",
+ "v3.1.0": "v3.1.0 (stable)",
+ "v3.0.2": "v3.0.0/v3.0.1/v3.0.2",
"v2.11.0": "v2.11.0",
"v2.10.0": "v2.10.0",
"v2.9.1": "v2.9.0/v2.9.1",
@@ -233,9 +238,11 @@ function platformToggle() {
const createFrameworkButtons = sample => {
const pytorchButton = document.createElement("button");
+ pytorchButton.classList.add('pytorch-button')
pytorchButton.innerText = "PyTorch";
const tensorflowButton = document.createElement("button");
+ tensorflowButton.classList.add('tensorflow-button')
tensorflowButton.innerText = "TensorFlow";
const selectorDiv = document.createElement("div");
@@ -250,22 +257,36 @@ function platformToggle() {
tensorflowButton.classList.remove("selected");
pytorchButton.addEventListener("click", () => {
- sample.element.innerHTML = sample.pytorchSample;
- pytorchButton.classList.add("selected");
- tensorflowButton.classList.remove("selected");
+ for(const codeBlock of updatedCodeBlocks){
+ codeBlock.element.innerHTML = codeBlock.pytorchSample;
+ }
+ Array.from(document.getElementsByClassName('pytorch-button')).forEach(button => {
+ button.classList.add("selected");
+ })
+ Array.from(document.getElementsByClassName('tensorflow-button')).forEach(button => {
+ button.classList.remove("selected");
+ })
});
tensorflowButton.addEventListener("click", () => {
- sample.element.innerHTML = sample.tensorflowSample;
- tensorflowButton.classList.add("selected");
- pytorchButton.classList.remove("selected");
+ for(const codeBlock of updatedCodeBlocks){
+ codeBlock.element.innerHTML = codeBlock.tensorflowSample;
+ }
+ Array.from(document.getElementsByClassName('tensorflow-button')).forEach(button => {
+ button.classList.add("selected");
+ })
+ Array.from(document.getElementsByClassName('pytorch-button')).forEach(button => {
+ button.classList.remove("selected");
+ })
});
};
- codeBlocks
+ const updatedCodeBlocks = codeBlocks
.map(element => {return {element: element.firstChild, innerText: element.innerText}})
.filter(codeBlock => codeBlock.innerText.includes(pytorchIdentifier) && codeBlock.innerText.includes(tensorflowIdentifier))
.map(getFrameworkSpans)
- .forEach(createFrameworkButtons);
+
+ updatedCodeBlocks
+ .forEach(createFrameworkButtons)
}
diff --git a/docs/source/benchmarks.rst b/docs/source/benchmarks.rst
index 38afce66fb811d..51eedc2fd2b1d0 100644
--- a/docs/source/benchmarks.rst
+++ b/docs/source/benchmarks.rst
@@ -1,23 +1,29 @@
Benchmarks
-==========
+=======================================================================================================================
Let's take a look at how 🤗 Transformer models can be benchmarked, best practices, and already available benchmarks.
-A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found `here `__.
+A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found `here
+`__.
How to benchmark 🤗 Transformer models
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` allow to flexibly benchmark 🤗 Transformer models.
-The benchmark classes allow us to measure the `peak memory usage` and `required time` for both
-`inference` and `training`.
+The classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` make it possible to
+flexibly benchmark 🤗 Transformer models. The benchmark classes allow us to measure the `peak memory usage` and
+`required time` for both `inference` and `training`.
.. note::
- Hereby, `inference` is defined by a single forward pass, and `training` is defined by a single forward pass and backward pass.
+ Hereby, `inference` is defined by a single forward pass, and `training` is defined by a single forward pass and
+ backward pass.
-The benchmark classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` expect an object of type :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments`, respectively, for instantiation. :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` are data classes and contain all relevant configurations for their corresponding benchmark class.
-In the following example, it is shown how a BERT model of type `bert-base-cased` can be benchmarked.
+The benchmark classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` expect an
+object of type :class:`~transformers.PyTorchBenchmarkArguments` and
+:class:`~transformers.TensorFlowBenchmarkArguments`, respectively, for instantiation.
+:class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` are data
+classes and contain all relevant configurations for their corresponding benchmark class. The following example shows
+how a BERT model of type `bert-base-cased` can be benchmarked.
.. code-block::
@@ -34,11 +40,15 @@ In the following example, it is shown how a BERT model of type `bert-base-cased`
>>> benchmark = TensorFlowBenchmark(args)
-Here, three arguments are given to the benchmark argument data classes, namely ``models``, ``batch_sizes``, and ``sequence_lengths``. The argument ``models`` is required and expects a :obj:`list` of model identifiers from the `model hub `__
-The :obj:`list` arguments ``batch_sizes`` and ``sequence_lengths`` define the size of the ``input_ids`` on which the model is benchmarked.
-There are many more parameters that can be configured via the benchmark argument data classes. For more detail on these one can either directly consult the files
-``src/transformers/benchmark/benchmark_args_utils.py``, ``src/transformers/benchmark/benchmark_args.py`` (for PyTorch) and ``src/transformers/benchmark/benchmark_args_tf.py`` (for Tensorflow).
-Alternatively, running the following shell commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow respectively.
+Here, three arguments are given to the benchmark argument data classes, namely ``models``, ``batch_sizes``, and
+``sequence_lengths``. The argument ``models`` is required and expects a :obj:`list` of model identifiers from the
+`model hub `__. The :obj:`list` arguments ``batch_sizes`` and ``sequence_lengths`` define
+the size of the ``input_ids`` on which the model is benchmarked. There are many more parameters that can be configured
+via the benchmark argument data classes. For more detail on these, one can either directly consult the files
+``src/transformers/benchmark/benchmark_args_utils.py``, ``src/transformers/benchmark/benchmark_args.py`` (for PyTorch)
+and ``src/transformers/benchmark/benchmark_args_tf.py`` (for TensorFlow), or run the following shell commands from
+root, which will print out a descriptive list of all configurable parameters for PyTorch and TensorFlow respectively.
.. code-block:: bash
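
As a rough sketch of those commands (the script locations under ``examples/benchmarking/`` are an assumption about the repository layout, not taken from this document):

```bash
# Print every configurable benchmark parameter for the PyTorch benchmark script ...
python examples/benchmarking/run_benchmark.py --help
# ... and for the TensorFlow one.
python examples/benchmarking/run_benchmark_tf.py --help
```
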
@@ -65,7 +75,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
bert-base-uncased 8 128 0.018
bert-base-uncased 8 512 0.088
--------------------------------------------------------------------------------
-
+
==================== INFERENCE - MEMORY - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Memory in MB
@@ -75,7 +85,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
bert-base-uncased 8 128 1307
bert-base-uncased 8 512 1539
--------------------------------------------------------------------------------
-
+
==================== ENVIRONMENT INFORMATION ====================
- transformers_version: 2.11.0
- framework: PyTorch
@@ -98,7 +108,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
- gpu_power_watts: 280.0
- gpu_performance_state: 2
- use_tpu: False
-
+
>>> ## TENSORFLOW CODE
>>> results = benchmark.run()
>>> print(results)
@@ -111,7 +121,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
bert-base-uncased 8 128 0.022
bert-base-uncased 8 512 0.105
--------------------------------------------------------------------------------
-
+
==================== INFERENCE - MEMORY - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Memory in MB
@@ -121,7 +131,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
bert-base-uncased 8 128 1330
bert-base-uncased 8 512 1770
--------------------------------------------------------------------------------
-
+
==================== ENVIRONMENT INFORMATION ====================
- transformers_version: 2.11.0
- framework: Tensorflow
@@ -145,14 +155,17 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
- gpu_performance_state: 2
- use_tpu: False
-By default, the `time` and the `required memory` for `inference` are benchmarked.
-In the example output above the first two sections show the result corresponding to `inference time` and `inference memory`.
-In addition, all relevant information about the computing environment, `e.g.` the GPU type, the system, the library versions, etc... are printed out in the third section under `ENVIRONMENT INFORMATION`.
-This information can optionally be saved in a `.csv` file when adding the argument :obj:`save_to_csv=True` to :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` respectively.
-In this case, every section is saved in a separate `.csv` file. The path to each `.csv` file can optionally be defined via the argument data classes.
+By default, the `time` and the `required memory` for `inference` are benchmarked. In the example output above the first
+two sections show the result corresponding to `inference time` and `inference memory`. In addition, all relevant
+information about the computing environment, `e.g.` the GPU type, the system, the library versions, etc... are printed
+out in the third section under `ENVIRONMENT INFORMATION`. This information can optionally be saved in a `.csv` file
+when adding the argument :obj:`save_to_csv=True` to :class:`~transformers.PyTorchBenchmarkArguments` and
+:class:`~transformers.TensorFlowBenchmarkArguments` respectively. In this case, every section is saved in a separate
+`.csv` file. The path to each `.csv` file can optionally be defined via the argument data classes.
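
For instance, a minimal sketch of the CSV option on the PyTorch side; only ``save_to_csv`` and the three list arguments come from the text above, the concrete values are illustrative:

```python
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

# Each result section (time, memory, environment) is written to its own .csv file.
args = PyTorchBenchmarkArguments(
    models=["bert-base-uncased"],
    batch_sizes=[8],
    sequence_lengths=[128, 512],
    save_to_csv=True,
)
benchmark = PyTorchBenchmark(args)
results = benchmark.run()
```
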
-Instead of benchmarking pre-trained models via their model identifier, `e.g.` `bert-base-uncased`, the user can alternatively benchmark an arbitrary configuration of any available model class.
-In this case, a :obj:`list` of configurations must be inserted with the benchmark args as follows.
+Instead of benchmarking pre-trained models via their model identifier, `e.g.` `bert-base-uncased`, the user can
+alternatively benchmark an arbitrary configuration of any available model class. In this case, a :obj:`list` of
+configurations must be inserted with the benchmark args as follows.
.. code-block::
@@ -183,7 +196,7 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
bert-6-lay 8 128 0.009
bert-6-lay 8 512 0.044
--------------------------------------------------------------------------------
-
+
==================== INFERENCE - MEMORY - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Memory in MB
@@ -201,7 +214,7 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
bert-6-lay 8 128 1127
bert-6-lay 8 512 1359
--------------------------------------------------------------------------------
-
+
==================== ENVIRONMENT INFORMATION ====================
- transformers_version: 2.11.0
- framework: PyTorch
@@ -252,7 +265,7 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
bert-6-lay 8 128 0.0011
bert-6-lay 8 512 0.074
--------------------------------------------------------------------------------
-
+
==================== INFERENCE - MEMORY - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Memory in MB
@@ -270,7 +283,7 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
bert-6-lay 8 128 1330
bert-6-lay 8 512 1540
--------------------------------------------------------------------------------
-
+
==================== ENVIRONMENT INFORMATION ====================
- transformers_version: 2.11.0
- framework: Tensorflow
@@ -295,28 +308,38 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
- use_tpu: False
-Again, `inference time` and `required memory` for `inference` are measured, but this time for customized configurations of the :obj:`BertModel` class. This feature can especially be helpful when
-deciding for which configuration the model should be trained.
+Again, `inference time` and `required memory` for `inference` are measured, but this time for customized configurations
+of the :obj:`BertModel` class. This feature can especially be helpful when deciding for which configuration the model
+should be trained.
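
A minimal sketch of this configuration-based variant (passing the configurations via a ``configs`` keyword is an assumption about the benchmark class constructor):

```python
from transformers import BertConfig, PyTorchBenchmark, PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(
    models=["bert-6-lay"], batch_sizes=[8], sequence_lengths=[128, 512]
)
# Benchmark an untrained 6-layer BERT configuration instead of a hub checkpoint.
config_6_lay = BertConfig(num_hidden_layers=6)

benchmark = PyTorchBenchmark(args, configs=[config_6_lay])
results = benchmark.run()
```
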
Benchmark best practices
-~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This section lists a couple of best practices one should be aware of when benchmarking a model.
-- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user
- specifies on which device the code should be run by setting the ``CUDA_VISIBLE_DEVICES`` environment variable in the shell, `e.g.` ``export CUDA_VISIBLE_DEVICES=0`` before running the code.
-- The option :obj:`no_multi_processing` should only be set to :obj:`True` for testing and debugging. To ensure accurate memory measurement it is recommended to run each memory benchmark in a separate process by making sure :obj:`no_multi_processing` is set to :obj:`True`.
-- One should always state the environment information when sharing the results of a model benchmark. Results can vary heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very useful for the community.
+- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user
+ specifies on which device the code should be run by setting the ``CUDA_VISIBLE_DEVICES`` environment variable in the
+ shell, `e.g.` ``export CUDA_VISIBLE_DEVICES=0`` before running the code.
+- The option :obj:`no_multi_processing` should only be set to :obj:`True` for testing and debugging. To ensure accurate
+ memory measurement it is recommended to run each memory benchmark in a separate process by making sure
+  :obj:`no_multi_processing` is set to :obj:`False`.
+- One should always state the environment information when sharing the results of a model benchmark. Results can vary
+ heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very
+ useful for the community.
Sharing your benchmark
-~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Previously all available core models (10 at the time) have been benchmarked for `inference time`, across many different settings: using PyTorch, with
-and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
-TensorFlow XLA) and GPUs.
+Previously, all available core models (10 at the time) were benchmarked for `inference time`, across many different
+settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. All of those tests were
+done across CPUs (except for TensorFlow XLA) and GPUs.
-The approach is detailed in the `following blogpost `__ and the results are available `here `__.
+The approach is detailed in the `following blogpost
+`__ and the results are
+available `here
+`__.
-With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community `here `__.
+With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community `here
+`__.
diff --git a/docs/source/bertology.rst b/docs/source/bertology.rst
index e1ebda78d6fc75..5e3ee5aed0002f 100644
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -1,18 +1,26 @@
BERTology
----------
+-----------------------------------------------------------------------------------------------------------------------
-There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
+There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT
+(that some call "BERTology"). Some good examples of this field are:
-* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
+* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick:
+ https://arxiv.org/abs/1905.05950
* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
-* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
+* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
+ Manning: https://arxiv.org/abs/1906.04341
-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to
+help people access the inner representations, mainly adapted from the great work of Paul Michel
+(https://arxiv.org/abs/1905.10650):
* accessing all the hidden-states of BERT/GPT/GPT-2,
* accessing all the attention weights for each head of BERT/GPT/GPT-2,
-* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
+* retrieving heads' output values and gradients, to be able to compute the head importance score and prune heads as
+  explained in https://arxiv.org/abs/1905.10650.
-To help you understand and use these features, we have added a specific example script: `bertology.py `_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: `bertology.py
+`_ which extracts
+information from and prunes a model pre-trained on GLUE.
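
A minimal sketch of accessing those hidden-states and attention weights with a BERT model (``bert-base-uncased`` is only a placeholder checkpoint; the position of the extra outputs may vary with the library version):

```python
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained(
    "bert-base-uncased", output_hidden_states=True, output_attentions=True
)

inputs = tokenizer("Hello world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# With both flags enabled, the outputs additionally contain one hidden-state
# tensor per layer (plus the embeddings) and one attention tensor per layer,
# returned here as the last two entries.
hidden_states, attentions = outputs[-2], outputs[-1]
```
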
diff --git a/docs/source/conf.py b/docs/source/conf.py
index f2a8e16577bde6..f5de445db1b364 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@
# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
-release = u'3.0.2'
+release = u'3.5.0'
# -- General configuration ---------------------------------------------------
diff --git a/docs/source/converting_tensorflow_models.rst b/docs/source/converting_tensorflow_models.rst
index 4151f8cf5c4d38..c1b642c5f4c86b 100644
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -1,24 +1,40 @@
Converting Tensorflow Checkpoints
-================================================
+=======================================================================================================================
-A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints in models than be loaded using the ``from_pretrained`` methods of the library.
+A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
+that can be loaded using the ``from_pretrained`` methods of the library.
.. note::
- Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**)
- available in any transformers >= 2.3.0 installation.
+ Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
+ transformers >= 2.3.0 installation.
The documentation below reflects the **transformers-cli convert** command format.
BERT
-^^^^
-
-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google `_\ ) in a PyTorch save file by using the `convert_bert_original_tf_checkpoint_to_pytorch.py `_ script.
-
-This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py `_\ , `run_bert_classifier.py `_ and `run_bert_squad.py `_\ ).
-
-You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
-
-To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch.
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google
+`_\ ) in a PyTorch save file by using the
+`convert_bert_original_tf_checkpoint_to_pytorch.py
+`_
+script.
+
+This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated
+configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights
+from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that
+can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py
+`_\ ,
+`run_bert_classifier.py
+`_ and
+`run_bert_squad.py `_\
+).
+
+You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
+checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\
+``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
+
+To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install
+tensorflow``\ ). The rest of the repository only requires PyTorch.
Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
@@ -31,14 +47,20 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
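
Reconstructed from the options shown, the full command looks roughly like this (the checkpoint directory is a placeholder path):

```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

transformers-cli convert --model_type bert \
  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
  --config $BERT_BASE_DIR/bert_config.json \
  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
```
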
-You can download Google's pre-trained models for the conversion `here `__.
+You can download Google's pre-trained models for the conversion `here
+`__.
ALBERT
-^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Convert TensorFlow model checkpoints of ALBERT to PyTorch using the `convert_albert_original_tf_checkpoint_to_pytorch.py `_ script.
+Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
+`convert_albert_original_tf_checkpoint_to_pytorch.py
+`_
+script.
-The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you will need to have TensorFlow and PyTorch installed.
+The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying
+configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you
+will need to have TensorFlow and PyTorch installed.
Here is an example of the conversion process for the pre-trained ``ALBERT Base`` model:
@@ -51,12 +73,15 @@ Here is an example of the conversion process for the pre-trained ``ALBERT Base``
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
-You can download Google's pre-trained models for the conversion `here `__.
+You can download Google's pre-trained models for the conversion `here
+`__.
OpenAI GPT
-^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint save as the same format than OpenAI pretrained model (see `here `__\ )
+Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint is
+saved in the same format as the OpenAI pre-trained model (see `here `__\
+)
.. code-block:: shell
@@ -70,9 +95,10 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
OpenAI GPT-2
-^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here `__\ )
+Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here
+`__\ )
.. code-block:: shell
@@ -85,9 +111,10 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
Transformer-XL
-^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here `__\ )
+Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here
+`__\ )
.. code-block:: shell
@@ -101,7 +128,7 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
XLNet
-^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained XLNet model:
@@ -118,7 +145,7 @@ Here is an example of the conversion process for a pre-trained XLNet model:
XLM
-^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained XLM model:
@@ -130,4 +157,4 @@ Here is an example of the conversion process for a pre-trained XLM model:
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT
[--config XML_CONFIG] \
- [--finetuning_task_name XML_FINETUNED_TASK]
\ No newline at end of file
+ [--finetuning_task_name XML_FINETUNED_TASK]
diff --git a/docs/source/custom_datasets.rst b/docs/source/custom_datasets.rst
index fd8b05aaeed38b..495fd3391282f2 100644
--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@@ -1,17 +1,17 @@
Fine-tuning with custom datasets
-================================
+=======================================================================================================================
.. note::
- The datasets used in this tutorial are available and can be more easily accessed using the
- `🤗 NLP library `_. We do not use this library to access the datasets here
- since this tutorial meant to illustrate how to work with your own data. A brief of introduction can be found
- at the end of the tutorial in the section ":ref:`nlplib`".
+ The datasets used in this tutorial are available and can be more easily accessed using the `🤗 NLP library
+    `_. We do not use this library to access the datasets here since this tutorial is
+    meant to illustrate how to work with your own data. A brief introduction can be found at the end of the tutorial
+ in the section ":ref:`nlplib`".
-This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The
-guide shows one of many valid workflows for using these models and is meant to be illustrative rather than
-definitive. We show examples of reading in several data formats, preprocessing the data for several types of tasks,
-and then preparing the data into PyTorch/TensorFlow ``Dataset`` objects which can easily be used either with
+This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The guide
+shows one of many valid workflows for using these models and is meant to be illustrative rather than definitive. We
+show examples of reading in several data formats, preprocessing the data for several types of tasks, and then preparing
+the data into PyTorch/TensorFlow ``Dataset`` objects which can easily be used either with
:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow.
We include several examples, each of which demonstrates a different type of common downstream task:
@@ -24,17 +24,17 @@ We include several examples, each of which demonstrates a different type of comm
.. _seq_imdb:
Sequence Classification with IMDb Reviews
------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
.. note::
- This dataset can be explored in the Hugging Face model hub (`IMDb `_), and can
- be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.
+ This dataset can be explored in the Hugging Face model hub (`IMDb `_), and
+ can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.
-In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task
-takes the text of a review and requires the model to predict whether the sentiment of the review is positive or
-negative. Let's start by downloading the dataset from the
-`Large Movie Review Dataset `_ webpage.
+In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task takes
+the text of a review and requires the model to predict whether the sentiment of the review is positive or negative.
+Let's start by downloading the dataset from the `Large Movie Review Dataset
+`_ webpage.
.. code-block:: bash
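
Concretely, the download boils down to fetching and unpacking the archive published on that page (URL as commonly distributed; verify against the webpage):

```bash
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xf aclImdb_v1.tar.gz
```
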
@@ -62,9 +62,8 @@ read this in.
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')
-We now have a train and test dataset, but let's also also create a validation set which we can use for for
-evaluation and tuning without training our test set results. Sklearn has a convenient utility for creating such
-splits:
+We now have a train and test dataset, but let's also create a validation set which we can use for evaluation and tuning
+without tainting our test set results. Sklearn has a convenient utility for creating such splits:
.. code-block:: python
@@ -80,8 +79,8 @@ pre-trained DistilBert, so let's use the DistilBert tokenizer.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
Now we can simply pass our texts to the tokenizer. We'll pass ``truncation=True`` and ``padding=True``, which will
-ensure that all of our sequences are padded to the same length and are truncated to be no longer model's maximum
-input length. This will allow us to feed batches of sequences into the model at the same time.
+ensure that all of our sequences are padded to the same length and are truncated to be no longer than the model's
+maximum input length. This will allow us to feed batches of sequences into the model at the same time.
.. code-block:: python
@@ -90,9 +89,9 @@ input length. This will allow us to feed batches of sequences into the model at
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
Now, let's turn our labels and encodings into a Dataset object. In PyTorch, this is done by subclassing a
-``torch.utils.data.Dataset`` object and implementing ``__len__`` and ``__getitem__``. In TensorFlow, we pass our input encodings and
-labels to the ``from_tensor_slices`` constructor method. We put the data in this format so that the data can be
-easily batched such that each key in the batch encoding corresponds to a named parameter of the
+``torch.utils.data.Dataset`` object and implementing ``__len__`` and ``__getitem__``. In TensorFlow, we pass our input
+encodings and labels to the ``from_tensor_slices`` constructor method. We put the data in this format so that the data
+can be easily batched such that each key in the batch encoding corresponds to a named parameter of the
:meth:`~transformers.DistilBertForSequenceClassification.forward` method of the model we will train.
.. code-block:: python
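
For reference, a minimal sketch of the PyTorch side of this wrapper (the TensorFlow variant would instead pass the encodings and labels to ``tf.data.Dataset.from_tensor_slices``):

```python
import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Map the tokenizer's keys (input_ids, attention_mask, ...) to tensors,
        # plus the label under the name the model's forward method expects.
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
```
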
@@ -133,17 +132,17 @@ easily batched such that each key in the batch encoding corresponds to a named p
))
Now that our datasets are ready, we can fine-tune a model either with the 🤗
-:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow. See
-:doc:`training `.
+:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow. See :doc:`training
+`.
.. _ft_trainer:
Fine-tuning with Trainer
-~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The steps above prepared the datasets in the way that the trainer is expected. Now all we need to do is create a
-model to fine-tune, define the :class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments`
-and instantiate a :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`.
+The steps above prepared the datasets in the way that the trainer expects. Now all we need to do is create a model
+to fine-tune, define the :class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` and
+instantiate a :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`.
.. code-block:: python
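
A sketch of the PyTorch side, reusing the ``train_dataset``/``val_dataset`` objects built above (the hyperparameter values are illustrative only):

```python
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",          # where checkpoints will be written
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()
```
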
@@ -200,7 +199,7 @@ and instantiate a :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer
.. _ft_native:
Fine-tuning with native PyTorch/TensorFlow
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We can also train using native PyTorch or TensorFlow:
@@ -244,19 +243,19 @@ We can also train use native PyTorch or TensorFlow:
.. _tok_ner:
Token Classification with W-NUT Emerging Entities
--------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
.. note::
- This dataset can be explored in the Hugging Face model hub (`WNUT-17 `_), and can
- be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.
+ This dataset can be explored in the Hugging Face model hub (`WNUT-17 `_),
+ and can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.
Next we will look at token classification. Rather than classifying an entire sequence, this task classifies token by
-token. We'll demonstrate how to do this with
-`Named Entity Recognition `_, which involves
-identifying tokens which correspond to a predefined set of "entities". Specifically, we'll use the
-`W-NUT Emerging and Rare entities `_ corpus. The data
-is given as a collection of pre-tokenized documents where each token is assigned a tag.
+token. We'll demonstrate how to do this with `Named Entity Recognition
+`_, which involves identifying tokens which correspond to
+a predefined set of "entities". Specifically, we'll use the `W-NUT Emerging and Rare entities
+`_ corpus. The data is given as a collection of
+pre-tokenized documents where each token is assigned a tag.
Let's start by downloading the data.
@@ -264,10 +263,10 @@ Let's start by downloading the data.
wget http://noisy-text.github.io/2017/files/wnut17train.conll
-In this case, we'll just download the train set, which is a single text file. Each line of the file contains either
-(1) a word and tag separated by a tab, or (2) a blank line indicating the end of a document. Let's write a
-function to read this in. We'll take in the file path and return ``token_docs`` which is a list of lists of token
-strings, and ``token_tags`` which is a list of lists of tag strings.
+In this case, we'll just download the train set, which is a single text file. Each line of the file contains either (1)
+a word and tag separated by a tab, or (2) a blank line indicating the end of a document. Let's write a function to read
+this in. We'll take in the file path and return ``token_docs`` which is a list of lists of token strings, and
+``token_tags`` which is a list of lists of tag strings.
.. code-block:: python
@@ -290,11 +289,11 @@ strings, and ``token_tags`` which is a list of lists of tag strings.
tags.append(tag)
token_docs.append(tokens)
tag_docs.append(tags)
-
+
return token_docs, tag_docs
-
+
texts, tags = read_wnut('wnut17train.conll')
-
+
Just to see what this data looks like, let's take a look at a segment of the first document.
.. code-block:: python
@@ -303,8 +302,8 @@ Just to see what this data looks like, let's take a look at a segment of the fir
['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']
-``location`` is an entity type, ``B-`` indicates the beginning of an entity, and ``I-`` indicates consecutive positions of
-the same entity ("Empire State Building" is considered one entity). ``O`` indicates the token does not correspond to
+``location`` is an entity type, ``B-`` indicates the beginning of an entity, and ``I-`` indicates consecutive positions
+of the same entity ("Empire State Building" is considered one entity). ``O`` indicates the token does not correspond to
any entity.
Now that we've read the data in, let's create a train/validation split:
@@ -314,8 +313,8 @@ Now that we've read the data in, let's create a train/validation split:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)
-Next, let's create encodings for our tokens and tags. For the tags, we can start by just create a simple mapping
-which we'll use in a moment:
+Next, let's create encodings for our tokens and tags. For the tags, we can start by just creating a simple mapping
+which we'll use in a moment:
.. code-block:: python
@@ -323,42 +322,42 @@ which we'll use in a moment:
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}
-To encode the tokens, we'll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we're dealing
-with ready-split tokens rather than full sentence strings by passing ``is_pretokenized=True``. We'll also pass
-``padding=True`` and ``truncation=True`` to pad the sequences to be the same length. Lastly, we can tell the model
-to return information about the tokens which are split by the wordpiece tokenization process, which we will need in
-a moment.
+To encode the tokens, we'll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we're dealing with
+ready-split tokens rather than full sentence strings by passing ``is_split_into_words=True``. We'll also pass
+``padding=True`` and ``truncation=True`` to pad the sequences to be the same length. Lastly, we can tell the model to
+return information about the tokens which are split by the wordpiece tokenization process, which we will need in a
+moment.
.. code-block:: python
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
- train_encodings = tokenizer(train_texts, is_pretokenized=True, return_offsets_mapping=True, padding=True, truncation=True)
- val_encodings = tokenizer(val_texts, is_pretokenized=True, return_offsets_mapping=True, padding=True, truncation=True)
+ train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
+ val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
Great, so now our tokens are nicely encoded in the format that they need to be in to feed them into our DistilBert
model below.
-Now we arrive at a common obstacle with using pre-trained models for token-level classification: many of the tokens
-in the W-NUT corpus are not in DistilBert's vocabulary. Bert and many models like it use a method called WordPiece
-Tokenization, meaning that single words are split into multiple tokens such that each token is likely to be in
-the vocabulary. For example, DistilBert's tokenizer would split the Twitter handle ``@huggingface`` into the tokens
-``['@', 'hugging', '##face']``. This is a problem for us because we have exactly one tag per token. If the tokenizer
-splits a token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.
+Now we arrive at a common obstacle with using pre-trained models for token-level classification: many of the tokens in
+the W-NUT corpus are not in DistilBert's vocabulary. Bert and many models like it use a method called WordPiece
+Tokenization, meaning that single words are split into multiple tokens such that each token is likely to be in the
+vocabulary. For example, DistilBert's tokenizer would split the Twitter handle ``@huggingface`` into the tokens ``['@',
+'hugging', '##face']``. This is a problem for us because we have exactly one tag per token. If the tokenizer splits a
+token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.
-One way to handle this is to only train on the tag labels for the first subtoken of a split token. We can do this in
-🤗 Transformers by setting the labels we wish to ignore to ``-100``. In the example above, if the label for
+One way to handle this is to only train on the tag labels for the first subtoken of a split token. We can do this in 🤗
+Transformers by setting the labels we wish to ignore to ``-100``. In the example above, if the label for
``@HuggingFace`` is ``3`` (indexing ``B-corporation``), we would set the labels of ``['@', 'hugging', '##face']`` to
``[3, -100, -100]``.
Let's write a function to do this. This is where we will use the ``offset_mapping`` from the tokenizer as mentioned
above. For each sub-token returned by the tokenizer, the offset mapping gives us a tuple indicating the sub-token's
-start position and end position relative to the original token it was split from. That means that if the first
-position in the tuple is anything other than ``0``, we will set its corresponding label to ``-100``. While we're at
-it, we can also set labels to ``-100`` if the second position of the offset mapping is ``0``, since this means it must
-be a special token like ``[PAD]`` or ``[CLS]``.
+start position and end position relative to the original token it was split from. That means that if the first position
+in the tuple is anything other than ``0``, we will set its corresponding label to ``-100``. While we're at it, we can
+also set labels to ``-100`` if the second position of the offset mapping is ``0``, since this means it must be a
+special token like ``[PAD]`` or ``[CLS]``.
-.. note::
+.. note::
Due to a recently fixed bug, -1 must be used instead of -100 when using TensorFlow in 🤗 Transformers <= 3.02.
@@ -379,7 +378,7 @@ be a special token like ``[PAD]`` or ``[CLS]``.
encoded_labels.append(doc_enc_labels.tolist())
return encoded_labels
-
+
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
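
Putting the rule above together, the ``encode_tags`` helper can be sketched as follows (reconstructed from the description; ``tag2id`` is the mapping created earlier):

```python
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # Start from -100 everywhere so sub-tokens and special tokens are ignored.
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # Only positions whose offset starts at 0 (first sub-token of a word) and
        # does not end at 0 (not a special token) receive a real label.
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels
```
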
@@ -443,12 +442,13 @@ sequence classification example above.
.. _qa_squad:
Question Answering with SQuAD 2.0
----------------------------------
+-----------------------------------------------------------------------------------------------------------------------
.. note::
- This dataset can be explored in the Hugging Face model hub (`SQuAD V2 `_), and can
- be alternatively downloaded with the 🤗 NLP library with ``load_dataset("squad_v2")``.
+ This dataset can be explored in the Hugging Face model hub (`SQuAD V2
+ `_), and can be alternatively downloaded with the 🤗 NLP library with
+ ``load_dataset("squad_v2")``.
Question answering comes in many forms. In this example, we'll look at the particular type of extractive QA that
involves answering a question about a passage by highlighting the segment of the passage that answers the question.
@@ -464,8 +464,8 @@ We will start by downloading the data:
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json
Each split is in a structured json file with a number of questions and answers for each passage (or context). We'll
-take this apart into parallel lists of contexts, questions, and answers (note that the contexts here are repeated
-since there are multiple questions per context):
+take this apart into parallel lists of contexts, questions, and answers (note that the contexts here are repeated since
+there are multiple questions per context):
.. code-block:: python
@@ -491,17 +491,17 @@ since there are multiple questions per context):
answers.append(answer)
return contexts, questions, answers
-
+
train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')
-The contexts and questions are just strings. The answers are dicts containing the subsequence of the passage with
-the correct answer as well as an integer indicating the character at which the answer begins. In order to train a
-model on this data we need (1) the tokenized context/question pairs, and (2) integers indicating at which *token*
-positions the answer begins and ends.
+The contexts and questions are just strings. The answers are dicts containing the subsequence of the passage with the
+correct answer as well as an integer indicating the character at which the answer begins. In order to train a model on
+this data we need (1) the tokenized context/question pairs, and (2) integers indicating at which *token* positions the
+answer begins and ends.
-First, let's get the *character* position at which the answer ends in the passage (we are given the starting
-position). Sometimes SQuAD answers are off by one or two characters, so we will also adjust for that.
+First, let's get the *character* position at which the answer ends in the passage (we are given the starting position).
+Sometimes SQuAD answers are off by one or two characters, so we will also adjust for that.
.. code-block:: python
@@ -510,7 +510,7 @@ position). Sometimes SQuAD answers are off by one or two characters, so we will
gold_text = answer['text']
start_idx = answer['answer_start']
end_idx = start_idx + len(gold_text)
-
+
# sometimes squad answers are off by a character or two – fix this
if context[start_idx:end_idx] == gold_text:
answer['answer_end'] = end_idx
@@ -524,9 +524,9 @@ position). Sometimes SQuAD answers are off by one or two characters, so we will
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)
-Now ``train_answers`` and ``val_answers`` include the character end positions and the corrected start positions.
-Next, let's tokenize our context/question pairs. 🤗 Tokenizers can accept parallel lists of sequences and encode
-them together as sequence pairs.
+Now ``train_answers`` and ``val_answers`` include the character end positions and the corrected start positions. Next,
+let's tokenize our context/question pairs. 🤗 Tokenizers can accept parallel lists of sequences and encode them together
+as sequence pairs.
.. code-block:: python
@@ -536,8 +536,8 @@ them together as sequence pairs.
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)
-Next we need to convert our character start/end positions to token start/end positions. When using 🤗 Fast
-Tokenizers, we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method.
+Next we need to convert our character start/end positions to token start/end positions. When using 🤗 Fast Tokenizers,
+we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method.
.. code-block:: python
@@ -557,9 +557,9 @@ Tokenizers, we can use the built in :func:`~transformers.BatchEncoding.char_to_t
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
-Our data is ready. Let's just put it in a PyTorch/TensorFlow dataset so that we can easily use it for
-training. In PyTorch, we define a custom ``Dataset`` class. In TensorFlow, we pass a tuple of
-``(inputs_dict, labels_dict)`` to the ``from_tensor_slices`` method.
+Our data is ready. Let's just put it in a PyTorch/TensorFlow dataset so that we can easily use it for training. In
+PyTorch, we define a custom ``Dataset`` class. In TensorFlow, we pass a tuple of ``(inputs_dict, labels_dict)`` to the
+``from_tensor_slices`` method.
.. code-block:: python
@@ -575,7 +575,7 @@ training. In PyTorch, we define a custom ``Dataset`` class. In TensorFlow, we pa
def __len__(self):
return len(self.encodings.input_ids)
-
+
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)
## TENSORFLOW CODE
@@ -655,7 +655,7 @@ multiple model outputs.
.. _resources:
Additional Resources
---------------------
+-----------------------------------------------------------------------------------------------------------------------
- `How to train a new language model from scratch using Transformers and Tokenizers
`_. Blog post showing the steps to load in Esperanto data and train a
@@ -666,14 +666,13 @@ Additional Resources
.. _nlplib:
Using the 🤗 NLP Datasets & Metrics library
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with
-🤗 Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the
-`🤗 NLP library `_ for working with the 150+ datasets included in the
-`hub `_, including the three datasets used in this tutorial. As a very brief overview,
-we will show how to use the NLP library to download and prepare the IMDb dataset from the first example,
-:ref:`seq_imdb`.
+This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with 🤗
+Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the `🤗
+NLP library `_ for working with the 150+ datasets included in the `hub
+`_, including the three datasets used in this tutorial. As a very brief overview, we
+will show how to use the NLP library to download and prepare the IMDb dataset from the first example, :ref:`seq_imdb`.
Start by downloading the dataset:
@@ -689,8 +688,8 @@ Each dataset has multiple columns corresponding to different features. Let's see
>>> print(train.column_names)
['label', 'text']
-Great. Now let's tokenize the text. We can do this using the ``map`` method. We'll also rename the ``label`` column
-to ``labels`` to match the model's input arguments.
+Great. Now let's tokenize the text. We can do this using the ``map`` method. We'll also rename the ``label`` column to
+``labels`` to match the model's input arguments.
.. code-block:: python
@@ -711,5 +710,5 @@ dataset elements.
>>> {key: val.shape for key, val in train[0].items()})
{'labels': TensorShape([]), 'input_ids': TensorShape([512]), 'attention_mask': TensorShape([512])}
-We now have a fully-prepared dataset. Check out `the 🤗 NLP docs `_ for
-a more thorough introduction.
\ No newline at end of file
+We now have a fully-prepared dataset. Check out `the 🤗 NLP docs `_ for a
+more thorough introduction.
diff --git a/docs/source/glossary.rst b/docs/source/glossary.rst
index 43355778b79dad..3b902623e31e6c 100644
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -1,8 +1,8 @@
Glossary
-^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
General terms
--------------
+-----------------------------------------------------------------------------------------------------------------------
- autoencoding models: see MLM
- autoregressive models: see CLM
@@ -27,7 +27,7 @@ General terms
or a punctuation symbol.
Model inputs
-------------
+-----------------------------------------------------------------------------------------------------------------------
Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
detailed here alongside usage examples.
@@ -35,7 +35,7 @@ detailed here alongside usage examples.
.. _input-ids:
Input IDs
-~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
numerical representations of tokens building the sequences that will be used as input by the model*.
@@ -43,7 +43,7 @@ numerical representations of tokens building the sequences that will be used as
Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
tokenizer, which is a `WordPiece `__ tokenizer:
-::
+.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
@@ -52,31 +52,31 @@ tokenizer, which is a `WordPiece `__ token
The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
-::
+.. code-block::
>>> tokenized_sequence = tokenizer.tokenize(sequence)
The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
-in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix is
-added for "RA" and "M":
+in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix
+is added for "RA" and "M":
-::
+.. code-block::
>>> print(tokenized_sequence)
['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
-the sentence to the tokenizer, which leverages the Rust implementation of
-`huggingface/tokenizers `__ for peak performance.
+the sentence to the tokenizer, which leverages the Rust implementation of `huggingface/tokenizers
+`__ for peak performance.
-::
+.. code-block::
>>> inputs = tokenizer(sequence)
The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
token indices are under the key "input_ids":
-::
+.. code-block::
>>> encoded_sequence = inputs["input_ids"]
>>> print(encoded_sequence)
@@ -87,13 +87,13 @@ IDs the model sometimes uses.
If we decode the previous sequence of ids,
-::
+.. code-block::
>>> decoded_sequence = tokenizer.decode(encoded_sequence)
we will see
-::
+.. code-block::
>>> print(decoded_sequence)
[CLS] A Titan RTX has 24GB of VRAM [SEP]
@@ -103,14 +103,14 @@ because this is the way a :class:`~transformers.BertModel` is going to expect it
.. _attention-mask:
Attention mask
-~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The attention mask is an optional argument used when batching sequences together. This argument indicates to the
-model which tokens should be attended to, and which should not.
+The attention mask is an optional argument used when batching sequences together. This argument indicates to the model
+which tokens should be attended to, and which should not.
For example, consider these two sequences:
-::
+.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
@@ -123,34 +123,34 @@ For example, consider these two sequences:
The encoded versions have different lengths:
-::
+.. code-block::
>>> len(encoded_sequence_a), len(encoded_sequence_b)
(8, 19)
-Therefore, we can't be put then together in a same tensor as-is. The first sequence needs to be padded up to the length
+Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
of the second one, or the second one needs to be truncated down to the length of the first one.
In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and ask
it to pad like this:
-::
+.. code-block::
>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
-::
+.. code-block::
>>> padded_sequences["input_ids"]
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
-This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
-the position of the padded indices so that the model does not attend to them. For the
-:class:`~transformers.BertTokenizer`, :obj:`1` indicates a value that should be attended to, while :obj:`0` indicates
-a padded value. This attention mask is in the dictionary returned by the tokenizer under the key "attention_mask":
+This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the
+position of the padded indices so that the model does not attend to them. For the :class:`~transformers.BertTokenizer`,
+:obj:`1` indicates a value that should be attended to, while :obj:`0` indicates a padded value. This attention mask is
+in the dictionary returned by the tokenizer under the key "attention_mask":
-::
+.. code-block::
>>> padded_sequences["attention_mask"]
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
@@ -158,20 +158,21 @@ a padded value. This attention mask is in the dictionary returned by the tokeniz
.. _token-type-ids:
Token Type IDs
-~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Some models' purpose is to do sequence classification or question answering. These require two different sequences to
-be joined in a single "input_ids" entry, which usually is performed with the help of special tokens, such as the classifier (``[CLS]``) and separator (``[SEP]``)
-tokens. For example, the BERT model builds its two sequence input as such:
+be joined in a single "input_ids" entry, which usually is performed with the help of special tokens, such as the
+classifier (``[CLS]``) and separator (``[SEP]``) tokens. For example, the BERT model builds its two sequence input as
+such:
-::
+.. code-block::
>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
-We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two arguments (and
-not a list, like before) like this:
+We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two
+arguments (and not a list, like before) like this:
-::
+.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
@@ -183,18 +184,18 @@ not a list, like before) like this:
which will return:
-::
+.. code-block::
>>> print(decoded)
[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
This is enough for some models to understand where one sequence ends and where another begins. However, other models,
-such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary
-mask identifying the two types of sequence in the model.
+such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary mask identifying
+the two types of sequence in the model.
The tokenizer returns this mask as the "token_type_ids" entry:
-::
+.. code-block::
>>> encoded_dict['token_type_ids']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
@@ -207,35 +208,80 @@ Some models, like :class:`~transformers.XLNetModel` use an additional token repr
.. _position-ids:
Position IDs
-~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Contrary to RNNs that have the position of each token embedded within them,
-transformers are unaware of the position of each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in the list of tokens.
+Contrary to RNNs that have the position of each token embedded within them, transformers are unaware of the position of
+each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in
+the list of tokens.
-They are an optional parameter. If no ``position_ids`` is passed to the model, the IDs are automatically created as absolute
-positional embeddings.
+They are an optional parameter. If no ``position_ids`` is passed to the model, the IDs are automatically created as
+absolute positional embeddings.
-Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
-use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
+Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models use
+other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
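+
+For illustration, a minimal sketch of passing explicit ``position_ids`` built with a simple range; leaving them out
+lets the model create the same default range internally:
+
+.. code-block::
+
+    >>> import torch
+    >>> from transformers import BertModel, BertTokenizer
+
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> model = BertModel.from_pretrained("bert-base-cased")
+    >>> inputs = tokenizer("A Titan RTX has 24GB of VRAM", return_tensors="pt")
+    >>> # one position per token, shape (1, sequence_length)
+    >>> position_ids = torch.arange(inputs["input_ids"].shape[1]).unsqueeze(0)
+    >>> outputs = model(**inputs, position_ids=position_ids)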
+
+.. _labels:
+
+Labels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The labels are an optional argument which can be passed in order for the model to compute the loss itself. These labels
+should be the expected predictions of the model: it will use its standard loss function to compute the loss between its
+predictions and the expected values (the labels).
+
+These labels are different according to the model head, for example:
+
+- For sequence classification models (e.g., :class:`~transformers.BertForSequenceClassification`), the model expects a
+ tensor of dimension :obj:`(batch_size)` with each value of the batch corresponding to the expected label of the
+ entire sequence.
+- For token classification models (e.g., :class:`~transformers.BertForTokenClassification`), the model expects a tensor
+ of dimension :obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual
+ token.
+- For masked language modeling (e.g., :class:`~transformers.BertForMaskedLM`), the model expects a tensor of dimension
+ :obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual token: the
+ labels being the token ID for the masked token, and values to be ignored for the rest (usually -100).
+- For sequence to sequence tasks (e.g., :class:`~transformers.BartForConditionalGeneration`,
+ :class:`~transformers.MBartForConditionalGeneration`), the model expects a tensor of dimension :obj:`(batch_size,
+ tgt_seq_length)` with each value corresponding to the target sequences associated with each input sequence. During
+ training, both `BART` and `T5` will make the appropriate `decoder_input_ids` and decoder attention masks internally.
+ They usually do not need to be supplied. This does not apply to models leveraging the Encoder-Decoder framework. See
+ the documentation of each model for more information on each specific model's labels.
+
+The base models (e.g., :class:`~transformers.BertModel`) do not accept labels, as these are the base transformer
+models, simply outputting features.
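+
+For illustration, a minimal sketch (assuming a sequence classification head with two labels) of passing ``labels`` so
+that the model computes the loss itself:
+
+.. code-block::
+
+    >>> import torch
+    >>> from transformers import BertForSequenceClassification, BertTokenizer
+
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
+    >>> inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
+    >>> labels = torch.tensor([1])  # one label for the whole sequence, shape (batch_size,)
+    >>> outputs = model(**inputs, labels=labels)
+    >>> loss = outputs[0]  # when labels are passed, the loss comes first in the outputs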
+
+.. _decoder-input-ids:
+
+Decoder input IDs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
+inputs should be used for sequence to sequence tasks, such as translation or summarization, and are usually built in a
+way specific to each model.
+
+Most encoder-decoder models (BART, T5) create their :obj:`decoder_input_ids` on their own from the :obj:`labels`. In
+such models, passing the :obj:`labels` is the preferred way to handle training.
+
+Please check each model's docs to see how they handle these input IDs for sequence to sequence training.
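+
+For illustration, a minimal sketch with T5 (using the ``t5-small`` checkpoint) where only the encoder inputs and the
+``labels`` are supplied, and the ``decoder_input_ids`` are derived internally:
+
+.. code-block::
+
+    >>> from transformers import T5ForConditionalGeneration, T5Tokenizer
+
+    >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+    >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
+    >>> inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")
+    >>> targets = tokenizer("Das Haus ist wunderbar.", return_tensors="pt")
+    >>> # passing labels is enough: the model shifts them to build the decoder_input_ids
+    >>> outputs = model(input_ids=inputs["input_ids"], labels=targets["input_ids"])
+    >>> loss = outputs[0]
+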
.. _feed-forward-chunking:
Feed Forward Chunking
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers.
-The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g.,
-for ``bert-base-uncased``).
+The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g., for
+``bert-base-uncased``).
For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward
embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory
use. The authors of `Reformer: The Efficient Transformer `_ noticed that since the
computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output
embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``
-individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with
-``n = sequence_length``, which trades increased computation time against reduced memory use, but yields a
-mathematically **equivalent** result.
+individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with ``n =
+sequence_length``, which trades increased computation time against reduced memory use, but yields a mathematically
+**equivalent** result.
For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the
number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
-complexity. If ``chunk_size`` is set to 0, no feed forward chunking is done.
+complexity. If ``chunk_size`` is set to 0, no feed forward chunking is done.
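+
+To make the equivalence concrete, here is a small self-contained sketch (plain PyTorch, not the library helper itself)
+that applies the same feed forward layers chunk by chunk along the sequence dimension and concatenates the results:
+
+.. code-block::
+
+    >>> import torch
+
+    >>> ff = torch.nn.Sequential(torch.nn.Linear(768, 3072), torch.nn.GELU(), torch.nn.Linear(3072, 768))
+    >>> hidden_states = torch.randn(2, 512, 768)  # [batch_size, sequence_length, hidden_size]
+    >>> full = ff(hidden_states)  # materializes the [2, 512, 3072] intermediate tensor in one go
+    >>> chunked = torch.cat([ff(chunk) for chunk in hidden_states.chunk(8, dim=1)], dim=1)  # 8 chunks of 64 positions
+    >>> torch.allclose(full, chunked, atol=1e-6)
+    True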
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 9d0ea1fc5b4805..1c70c98584a438 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,17 +1,17 @@
Transformers
-================================================================================================================================================
+=======================================================================================================================
State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0.
-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose
-architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural
-Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose
+architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural
+Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
This is the documentation of our repository `transformers `_.
Features
----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
- High performance on NLU and NLG tasks
- Low barrier to entry for educators and practitioners
@@ -36,7 +36,7 @@ Choose the right framework for every part of a model's lifetime:
- Seamlessly pick the right framework for training, evaluation, production
Contents
----------------------------------
+-----------------------------------------------------------------------------------------------------------------------
The documentation is organized in five parts:
@@ -46,90 +46,125 @@ The documentation is organized in five parts:
- **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
  transformers models
-- **PACKAGE REFERENCE** contains the documentation of each public class and function.
+- The last three sections contain the documentation of each public class and function, grouped in:
+
+ - **MAIN CLASSES** for the main classes exposing the important APIs of the library.
+ - **MODELS** for the classes and functions related to each model implemented in the library.
+ - **INTERNAL HELPERS** for the classes and functions we use internally.
The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and
conversion utilities for the following models:
-1. `BERT `_ (from Google) released with the paper `BERT: Pre-training of Deep
- Bidirectional Transformers for Language Understanding `_ by Jacob Devlin, Ming-Wei
- Chang, Kenton Lee, and Kristina Toutanova.
-2. `GPT `_ (from OpenAI) released with the paper `Improving Language
- Understanding by Generative Pre-Training `_ by Alec Radford, Karthik
- Narasimhan, Tim Salimans, and Ilya Sutskever.
-3. `GPT-2 `_ (from OpenAI) released with the paper `Language Models are
- Unsupervised Multitask Learners `_ by Alec Radford, Jeffrey Wu,
- Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever.
-4. `Transformer-XL `_ (from Google/CMU) released with the paper
- `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context `_ by
- Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, and Ruslan Salakhutdinov.
-5. `XLNet `_ (from Google/CMU) released with the paper `XLNet: Generalized
- Autoregressive Pretraining for Language Understanding `_ by Zhilin Yang, Zihang
- Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, and Quoc V. Le.
-6. `XLM `_ (from Facebook) released together with the paper `Cross-lingual
- Language Model Pretraining `_ by Guillaume Lample and Alexis Conneau.
-7. `RoBERTa `_ (from Facebook), released together with
- the paper a `Robustly Optimized BERT Pretraining Approach `_ by Yinhan Liu, Myle
- Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin
- Stoyanov.
-8. `DistilBERT `_ (from HuggingFace) released together
- with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter
- `_ by Victor Sanh, Lysandre Debut, and Thomas Wolf. The same method has been
- applied to compress GPT2 into
- `DistilGPT2 `_.
-9. `CTRL `_ (from Salesforce), released together with the
- paper `CTRL: A Conditional Transformer Language Model for Controllable Generation
- `_ by Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, Caiming Xiong,
- and Richard Socher.
-10. `CamemBERT `_ (from FAIR, Inria, Sorbonne Université)
- released together with the paper `CamemBERT: a Tasty French Language Model `_ by
- Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la
- Clergerie, Djame Seddah, and Benoît Sagot.
-11. `ALBERT `_ (from Google Research), released together with the paper
- `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations `_
- by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut.
-12. `T5 `_ (from Google) released with the paper
- `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
- `_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
- Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu.
-13. `XLM-RoBERTa `_ (from Facebook AI), released together
- with the paper `Unsupervised Cross-lingual Representation Learning at Scale `_ by
- Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard
- Grave, Myle Ott, Luke Zettlemoyer, and Veselin Stoyanov.
-14. `MMBT `_ (from Facebook), released together with the paper a `Supervised
- Multimodal Bitransformers for Classifying Images and Text `_ by Douwe Kiela,
- Suvrat Bhooshan, Hamed Firooz, and Davide Testuggine.
-15. `FlauBERT `_ (from CNRS) released with the paper `FlauBERT: Unsupervised
- Language Model Pre-training for French `_ by Hang Le, Loïc Vial, Jibril Frej,
- Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, and
- Didier Schwab.
-16. `BART `_ (from Facebook) released with the paper
- `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
- `_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
- Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
-17. `ELECTRA `_ (from Google Research/Stanford University) released with
- the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators
- `_ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, and Christopher D. Manning.
-18. `DialoGPT `_ (from Microsoft Research) released with the paper `DialoGPT:
- Large-Scale Generative Pre-training for Conversational Response Generation `_ by
- Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu,
- and Bill Dolan.
-19. `Reformer `_ (from Google Research) released with
- the paper `Reformer: The Efficient Transformer `_ by Nikita Kitaev, Łukasz
- Kaiser, and Anselm Levskaya.
-20. `MarianMT `_ (developed by the Microsoft Translator Team) machine translation models
- trained using `OPUS `_ pretrained_models data by Jörg Tiedemann.
-21. `Longformer `_ (from AllenAI) released with the paper `Longformer: The
- Long-Document Transformer `_ by Iz Beltagy, Matthew E. Peters, and Arman Cohan.
-22. `DPR `_ (from Facebook) released with the paper `Dense Passage Retrieval
- for Open-Domain Question Answering `_ by Vladimir Karpukhin, Barlas Oğuz, Sewon
- Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-23. `Pegasus `_ (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization
- `_ by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-24. `MBart `_ (from Facebook) released with the paper `Multilingual Denoising Pre-training for Neural Machine Translation `_ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov,
- Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-25. `Other community models `_, contributed by the `community
- `_.
+..
+ This list is updated automatically from the README with `make fix-copies`. Do not update manually!
+
+1. :doc:`ALBERT ` (from Google Research and the Toyota Technological Institute at Chicago) released
+ with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations
+ `__, by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush
+ Sharma, Radu Soricut.
+2. :doc:`BART ` (from Facebook) released with the paper `BART: Denoising Sequence-to-Sequence
+ Pre-training for Natural Language Generation, Translation, and Comprehension
+ `__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
+ Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+3. :doc:`BERT ` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
+ Transformers for Language Understanding `__ by Jacob Devlin, Ming-Wei Chang,
+ Kenton Lee and Kristina Toutanova.
+4. :doc:`BERT For Sequence Generation ` (from Google) released with the paper `Leveraging
+ Pre-trained Checkpoints for Sequence Generation Tasks `__ by Sascha Rothe, Shashi
+ Narayan, Aliaksei Severyn.
+5. :doc:`Blenderbot ` (from Facebook) released with the paper `Recipes for building an
+ open-domain chatbot `__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
+ Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+6. :doc:`CamemBERT ` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+ French Language Model `__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
+ Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+7. :doc:`CTRL ` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+ Model for Controllable Generation `__ by Nitish Shirish Keskar*, Bryan McCann*,
+ Lav R. Varshney, Caiming Xiong and Richard Socher.
+8. :doc:`DeBERTa ` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
+ BERT with Disentangled Attention `__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
+ Weizhu Chen.
+9. :doc:`DialoGPT ` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+ Generative Pre-training for Conversational Response Generation `__ by Yizhe Zhang,
+ Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+10. :doc:`DistilBERT ` (from HuggingFace), released together with the paper `DistilBERT, a
+ distilled version of BERT: smaller, faster, cheaper and lighter `__ by Victor
+ Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
+ `__, RoBERTa into `DistilRoBERTa
+ `__, Multilingual BERT into
+ `DistilmBERT `__ and a German
+ version of DistilBERT.
+11. :doc:`DPR ` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+ Question Answering `__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
+ Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+12. :doc:`ELECTRA ` (from Google Research/Stanford University) released with the paper `ELECTRA:
+ Pre-training text encoders as discriminators rather than generators `__ by Kevin
+ Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+13. :doc:`FlauBERT ` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+ Pre-training for French `__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
+ Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+14. :doc:`Funnel Transformer ` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+ Filtering out Sequential Redundancy for Efficient Language Processing `__ by
+ Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+15. :doc:`GPT ` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+ Pre-Training `__ by Alec Radford, Karthik Narasimhan, Tim Salimans
+ and Ilya Sutskever.
+16. :doc:`GPT-2 ` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+ Learners `__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
+ Luan, Dario Amodei** and Ilya Sutskever**.
+17. :doc:`LayoutLM ` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+ of Text and Layout for Document Image Understanding `__ by Yiheng Xu, Minghao Li,
+ Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+18. :doc:`Longformer ` (from AllenAI) released with the paper `Longformer: The Long-Document
+ Transformer `__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+19. :doc:`LXMERT ` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+ Encoder Representations from Transformers for Open-Domain Question Answering `__
+ by Hao Tan and Mohit Bansal.
+20. :doc:`MarianMT ` Machine translation models trained using `OPUS `__ data by
+ Jörg Tiedemann. The `Marian Framework `__ is being developed by the Microsoft
+ Translator Team.
+21. :doc:`MBart ` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+ Neural Machine Translation `__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
+ Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+22. :doc:`MT5 ` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+ text-to-text transformer `__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
+ Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+23. :doc:`Pegasus ` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+    Gap-sentences for Abstractive Summarization `__ by Jingqing Zhang, Yao Zhao,
+    Mohammad Saleh and Peter J. Liu.
+24. :doc:`ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+ Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan, Weizhen Qi,
+ Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+25. :doc:`Reformer ` (from Google Research) released with the paper `Reformer: The Efficient
+ Transformer `__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+26. :doc:`RoBERTa ` (from Facebook), released together with the paper `RoBERTa: A Robustly Optimized BERT
+    Pretraining Approach `__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
+    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+27. :doc:`SqueezeBert ` released with the paper `SqueezeBERT: What can computer vision teach NLP
+ about efficient neural networks? `__ by Forrest N. Iandola, Albert E. Shaw, Ravi
+ Krishna, and Kurt W. Keutzer.
+28. :doc:`T5 ` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+ Unified Text-to-Text Transformer `__ by Colin Raffel and Noam Shazeer and Adam
+ Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+29. :doc:`Transformer-XL ` (from Google/CMU) released with the paper `Transformer-XL:
+ Attentive Language Models Beyond a Fixed-Length Context `__ by Zihang Dai*,
+ Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+30. :doc:`XLM ` (from Facebook) released together with the paper `Cross-lingual Language Model
+ Pretraining `__ by Guillaume Lample and Alexis Conneau.
+31. :doc:`XLM-ProphetNet ` (from Microsoft Research) released with the paper `ProphetNet:
+ Predicting Future N-gram for Sequence-to-Sequence Pre-training `__ by Yu Yan,
+ Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+32. :doc:`XLM-RoBERTa ` (from Facebook AI), released together with the paper `Unsupervised
+ Cross-lingual Representation Learning at Scale `__ by Alexis Conneau*, Kartikay
+ Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
+ Zettlemoyer and Veselin Stoyanov.
+33. :doc:`XLNet ` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+ Pretraining for Language Understanding `__ by Zhilin Yang*, Zihang Dai*, Yiming
+ Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+34. `Other community models `__, contributed by the `community
+ `__.
.. toctree::
:maxdepth: 2
@@ -163,6 +198,7 @@ conversion utilities for the following models:
converting_tensorflow_models
migration
contributing
+ testing
serialization
.. toctree::
@@ -175,43 +211,69 @@ conversion utilities for the following models:
.. toctree::
:maxdepth: 2
- :caption: Package Reference
+ :caption: Main Classes
+ main_classes/callback
main_classes/configuration
- main_classes/output
+ main_classes/logging
main_classes/model
- main_classes/tokenizer
- main_classes/pipelines
- main_classes/trainer
main_classes/optimizer_schedules
+ main_classes/output
+ main_classes/pipelines
main_classes/processors
+ main_classes/tokenizer
+ main_classes/trainer
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Models
+
+ model_doc/albert
model_doc/auto
- model_doc/encoderdecoder
+ model_doc/bart
model_doc/bert
- model_doc/gpt
- model_doc/transformerxl
- model_doc/gpt2
- model_doc/xlm
- model_doc/xlnet
- model_doc/roberta
- model_doc/distilbert
- model_doc/ctrl
+ model_doc/bertgeneration
+ model_doc/blenderbot
model_doc/camembert
- model_doc/albert
- model_doc/xlmroberta
- model_doc/flaubert
- model_doc/bart
- model_doc/t5
- model_doc/electra
+ model_doc/ctrl
+ model_doc/deberta
model_doc/dialogpt
- model_doc/reformer
- model_doc/marian
+ model_doc/distilbert
+ model_doc/dpr
+ model_doc/electra
+ model_doc/encoderdecoder
+ model_doc/flaubert
+ model_doc/fsmt
+ model_doc/funnel
+ model_doc/layoutlm
model_doc/longformer
- model_doc/retribert
+ model_doc/lxmert
+ model_doc/marian
+ model_doc/mbart
model_doc/mobilebert
- model_doc/dpr
+ model_doc/mt5
+ model_doc/gpt
+ model_doc/gpt2
model_doc/pegasus
- model_doc/mbart
+ model_doc/prophetnet
+ model_doc/rag
+ model_doc/reformer
+ model_doc/retribert
+ model_doc/roberta
+ model_doc/squeezebert
+ model_doc/t5
+ model_doc/transformerxl
+ model_doc/xlm
+ model_doc/xlmprophetnet
+ model_doc/xlmroberta
+ model_doc/xlnet
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Internal Helpers
+
internal/modeling_utils
- internal/tokenization_utils
internal/pipelines_utils
+ internal/tokenization_utils
+ internal/trainer_utils
+ internal/generation_utils
diff --git a/docs/source/installation.md b/docs/source/installation.md
index 793d07a306a0dd..8e5a37af4b8df3 100644
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -37,13 +37,13 @@ pip install transformers[tf-cpu]
To check 🤗 Transformers is properly installed, run the following command:
```bash
-python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
```
It should download a pretrained model then print something like
```bash
-[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]
+[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
```
(Note that TensorFlow will print additional stuff before that last statement.)
@@ -80,9 +80,9 @@ cache home followed by ``/transformers/`` (even if you don't have PyTorch instal
So if you don't have any specific environment variable set, the cache directory will be at
``~/.cache/torch/transformers/``.
-**Note:** If you have set a shell enviromnent variable for one of the predecessors of this library
+**Note:** If you have set a shell environment variable for one of the predecessors of this library
(``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
-enviromnent variable for ``TRANSFORMERS_CACHE``.
+environment variable for ``TRANSFORMERS_CACHE``.
### Note on model downloads (Continuous Integration or large-scale deployments)
diff --git a/docs/source/internal/generation_utils.rst b/docs/source/internal/generation_utils.rst
new file mode 100644
index 00000000000000..9496827a5e16a4
--- /dev/null
+++ b/docs/source/internal/generation_utils.rst
@@ -0,0 +1,50 @@
+Utilities for Generation
+-----------------------------------------------------------------------------------------------------------------------
+
+This page lists all the utility functions used by :meth:`~transformers.PreTrainedModel.generate`,
+:meth:`~transformers.PreTrainedModel.greedy_search`, :meth:`~transformers.PreTrainedModel.sample`,
+:meth:`~transformers.PreTrainedModel.beam_search`, and :meth:`~transformers.PreTrainedModel.beam_sample`.
+
+Most of those are only useful if you are studying the code of the generate methods in the library.
+
+LogitsProcessor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A :class:`~transformers.LogitsProcessor` can be used to modify the prediction scores of a language model head for
+generation.
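+
+As a rough usage sketch (assuming these classes are importable from the top level of the package), several processors
+can be combined in a :class:`~transformers.LogitsProcessorList` and applied to a batch of next-token scores:
+
+.. code-block:: python
+
+    import torch
+    from transformers import LogitsProcessorList, MinLengthLogitsProcessor, TopKLogitsWarper
+
+    processors = LogitsProcessorList([
+        MinLengthLogitsProcessor(min_length=10, eos_token_id=0),
+        TopKLogitsWarper(top_k=50),
+    ])
+
+    input_ids = torch.tensor([[1, 2, 3]])  # tokens generated so far
+    scores = torch.randn(1, 32000)         # next-token logits for a hypothetical vocabulary of 32000
+    scores = processors(input_ids, scores) # scores modified in turn by each processor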
+
+.. autoclass:: transformers.LogitsProcessor
+ :members: __call__
+
+.. autoclass:: transformers.LogitsProcessorList
+ :members: __call__
+
+.. autoclass:: transformers.MinLengthLogitsProcessor
+ :members: __call__
+
+.. autoclass:: transformers.TemperatureLogitsWarper
+ :members: __call__
+
+.. autoclass:: transformers.RepetitionPenaltyLogitsProcessor
+ :members: __call__
+
+.. autoclass:: transformers.TopPLogitsWarper
+ :members: __call__
+
+.. autoclass:: transformers.TopKLogitsWarper
+ :members: __call__
+
+.. autoclass:: transformers.NoRepeatNGramLogitsProcessor
+ :members: __call__
+
+.. autoclass:: transformers.NoBadWordsLogitsProcessor
+ :members: __call__
+
+BeamSearch
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeamScorer
+ :members: process, finalize
+
+.. autoclass:: transformers.BeamSearchScorer
+ :members: process, finalize
diff --git a/docs/source/internal/modeling_utils.rst b/docs/source/internal/modeling_utils.rst
index 9e7fb6b11c8910..59f5cb768bb198 100644
--- a/docs/source/internal/modeling_utils.rst
+++ b/docs/source/internal/modeling_utils.rst
@@ -1,13 +1,13 @@
Custom Layers and Utilities
----------------------------
+-----------------------------------------------------------------------------------------------------------------------
This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.
Most of those are only useful if you are studying the code of the models in the library.
-``Pytorch custom modules``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+PyTorch custom modules
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_utils.Conv1D
@@ -29,8 +29,8 @@ Most of those are only useful if you are studying the code of the models in the
:members: forward
-``PyTorch Helper Functions``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+PyTorch Helper Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.apply_chunking_to_forward
@@ -42,8 +42,8 @@ Most of those are only useful if you are studying the code of the models in the
.. autofunction:: transformers.modeling_utils.prune_linear_layer
-``TensorFlow custom layers``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TensorFlow custom layers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_utils.TFConv1D
@@ -54,8 +54,8 @@ Most of those are only useful if you are studying the code of the models in the
:members: call
-``TensorFlow loss functions``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TensorFlow loss functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_utils.TFCausalLanguageModelingLoss
:members:
@@ -76,8 +76,8 @@ Most of those are only useful if you are studying the code of the models in the
:members:
-``TensorFlow Helper Functions``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TensorFlow Helper Functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.modeling_tf_utils.cast_bool_to_primitive
@@ -85,4 +85,4 @@ Most of those are only useful if you are studying the code of the models in the
.. autofunction:: transformers.modeling_tf_utils.keras_serializable
-.. autofunction:: transformers.modeling_tf_utils.shape_list
\ No newline at end of file
+.. autofunction:: transformers.modeling_tf_utils.shape_list
diff --git a/docs/source/internal/pipelines_utils.rst b/docs/source/internal/pipelines_utils.rst
index c6fda75803c291..1e33551af7b039 100644
--- a/docs/source/internal/pipelines_utils.rst
+++ b/docs/source/internal/pipelines_utils.rst
@@ -1,40 +1,40 @@
-Utilities for pipelines
------------------------
-
-This page lists all the utility functions the library provides for pipelines.
-
-Most of those are only useful if you are studying the code of the models in the library.
-
-
-Argument handling
-~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.pipelines.ArgumentHandler
-
-.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler
-
-.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler
-
-
-Data format
-~~~~~~~~~~~
-
-.. autoclass:: transformers.pipelines.PipelineDataFormat
- :members:
-
-.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
- :members:
-
-.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
- :members:
-
-.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
- :members:
-
-
-Utilities
-~~~~~~~~~
-
-.. autofunction:: transformers.pipelines.get_framework
-
-.. autoclass:: transformers.pipelines.PipelineException
+Utilities for pipelines
+-----------------------------------------------------------------------------------------------------------------------
+
+This page lists all the utility functions the library provides for pipelines.
+
+Most of those are only useful if you are studying the code of the models in the library.
+
+
+Argument handling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.pipelines.ArgumentHandler
+
+.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler
+
+.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler
+
+
+Data format
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.pipelines.PipelineDataFormat
+ :members:
+
+.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
+ :members:
+
+.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
+ :members:
+
+.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
+ :members:
+
+
+Utilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.pipelines.get_framework
+
+.. autoclass:: transformers.pipelines.PipelineException
diff --git a/docs/source/internal/tokenization_utils.rst b/docs/source/internal/tokenization_utils.rst
index 48752c8de26107..ac861306306f13 100644
--- a/docs/source/internal/tokenization_utils.rst
+++ b/docs/source/internal/tokenization_utils.rst
@@ -1,38 +1,39 @@
-Utilities for Tokenizers
-------------------------
-
-This page lists all the utility functions used by the tokenizers, mainly the class
-:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
-:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
-:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
-
-Most of those are only useful if you are studying the code of the tokenizers in the library.
-
-``PreTrainedTokenizerBase``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
- :special-members: __call__
- :members:
-
-
-``SpecialTokensMixin``
-~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
- :members:
-
-
-Enums and namedtuples
-~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
-
-.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
-
-.. autoclass:: transformers.tokenization_utils_base.TensorType
-
-.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
-
-.. autoclass:: transformers.tokenization_utils_base.CharSpan
-
-.. autoclass:: transformers.tokenization_utils_base.TokenSpan
+Utilities for Tokenizers
+-----------------------------------------------------------------------------------------------------------------------
+
+This page lists all the utility functions used by the tokenizers, mainly the class
+:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
+:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
+:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
+
+Most of those are only useful if you are studying the code of the tokenizers in the library.
+
+PreTrainedTokenizerBase
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
+ :special-members: __call__
+ :members:
+
+
+SpecialTokensMixin
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
+ :members:
+
+
+Enums and namedtuples
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
+
+.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
+
+.. autoclass:: transformers.tokenization_utils_base.TensorType
+
+.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
+
+.. autoclass:: transformers.tokenization_utils_base.CharSpan
+
+.. autoclass:: transformers.tokenization_utils_base.TokenSpan
diff --git a/docs/source/internal/trainer_utils.rst b/docs/source/internal/trainer_utils.rst
new file mode 100644
index 00000000000000..4afbfa0adbe7e1
--- /dev/null
+++ b/docs/source/internal/trainer_utils.rst
@@ -0,0 +1,27 @@
+Utilities for Trainer
+-----------------------------------------------------------------------------------------------------------------------
+
+This page lists all the utility functions used by :class:`~transformers.Trainer`.
+
+Most of those are only useful if you are studying the code of the Trainer in the library.
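+
+For orientation, an illustrative sketch of the two utilities users meet most often: :func:`~transformers.set_seed` and
+:class:`~transformers.EvalPrediction` (the object handed to a ``compute_metrics`` function):
+
+.. code-block:: python
+
+    import numpy as np
+    from transformers import EvalPrediction, set_seed
+
+    set_seed(42)  # seeds python, numpy and the available deep learning framework(s)
+
+    def compute_metrics(p: EvalPrediction):
+        preds = np.argmax(p.predictions, axis=-1)
+        return {"accuracy": float((preds == p.label_ids).mean())}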
+
+Utilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.EvalPrediction
+
+.. autofunction:: transformers.set_seed
+
+.. autofunction:: transformers.torch_distributed_zero_first
+
+
+Callbacks internals
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.trainer_callback.CallbackHandler
+
+Distributed Evaluation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer
+ :members:
diff --git a/docs/source/main_classes/callback.rst b/docs/source/main_classes/callback.rst
new file mode 100644
index 00000000000000..f146244c1fd9aa
--- /dev/null
+++ b/docs/source/main_classes/callback.rst
@@ -0,0 +1,75 @@
+Callbacks
+-----------------------------------------------------------------------------------------------------------------------
+
+Callbacks are objects that can customize the behavior of the training loop in the PyTorch
+:class:`~transformers.Trainer` (this feature is not yet implemented in TensorFlow). They can inspect the training loop
+state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early
+stopping).
+
+Callbacks are "read only" pieces of code: apart from the :class:`~transformers.TrainerControl` object they return, they
+cannot change anything in the training loop. For customizations that require changes in the training loop, you should
+subclass :class:`~transformers.Trainer` and override the methods you need (see :doc:`trainer` for examples).
+
+By default a :class:`~transformers.Trainer` will use the following callbacks:
+
+- :class:`~transformers.DefaultFlowCallback` which handles the default behavior for logging, saving and evaluation.
+- :class:`~transformers.PrinterCallback` or :class:`~transformers.ProgressCallback` to display progress and print the
+ logs (the first one is used if you deactivate tqdm through the :class:`~transformers.TrainingArguments`, otherwise
+ it's the second one).
+- :class:`~transformers.integrations.TensorBoardCallback` if tensorboard is accessible (either through PyTorch >= 1.4
+ or tensorboardX).
+- :class:`~transformers.integrations.WandbCallback` if `wandb `__ is installed.
+- :class:`~transformers.integrations.CometCallback` if `comet_ml `__ is installed.
+- :class:`~transformers.integrations.MLflowCallback` if `mlflow `__ is installed.
+- :class:`~transformers.integrations.AzureMLCallback` if `azureml-sdk `__ is
+ installed.
+
+The main class that implements callbacks is :class:`~transformers.TrainerCallback`. It gets the
+:class:`~transformers.TrainingArguments` used to instantiate the :class:`~transformers.Trainer`, can access that
+Trainer's internal state via :class:`~transformers.TrainerState`, and can take some actions on the training loop via
+:class:`~transformers.TrainerControl`.
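+
+As a minimal sketch (the callback name and body below are purely illustrative), a custom callback subclasses
+:class:`~transformers.TrainerCallback`, overrides the events it cares about and is passed to the
+:class:`~transformers.Trainer` at init:
+
+.. code-block:: python
+
+    from transformers import TrainerCallback
+
+    class PrintEpochCallback(TrainerCallback):
+        """A hypothetical callback that prints a message at the end of every epoch."""
+
+        def on_epoch_end(self, args, state, control, **kwargs):
+            print(f"Finished epoch {state.epoch}")
+
+    # trainer = Trainer(model=model, args=training_args, callbacks=[PrintEpochCallback()], ...)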
+
+
+Available Callbacks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here is the list of the available :class:`~transformers.TrainerCallback` in the library:
+
+.. autoclass:: transformers.integrations.CometCallback
+ :members: setup
+
+.. autoclass:: transformers.DefaultFlowCallback
+
+.. autoclass:: transformers.PrinterCallback
+
+.. autoclass:: transformers.ProgressCallback
+
+.. autoclass:: transformers.integrations.TensorBoardCallback
+
+.. autoclass:: transformers.integrations.WandbCallback
+ :members: setup
+
+.. autoclass:: transformers.integrations.MLflowCallback
+ :members: setup
+
+.. autoclass:: transformers.integrations.AzureMLCallback
+
+TrainerCallback
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainerCallback
+ :members:
+
+
+TrainerState
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainerState
+ :members:
+
+
+TrainerControl
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainerControl
+ :members:
diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst
index 03e31fe5804c98..04db915c06449c 100644
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -1,12 +1,13 @@
Configuration
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
-The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a
-local file or directory, or from a pretrained model configuration provided by the library (downloaded from
-HuggingFace's AWS S3 repository).
+The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration
+either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
+from HuggingFace's AWS S3 repository).
-``PretrainedConfig``
-~~~~~~~~~~~~~~~~~~~~~
+
+PretrainedConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PretrainedConfig
:members:
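+
+A minimal usage sketch of the loading/saving methods described above (the local directory name is just an example):
+
+.. code-block:: python
+
+    from transformers import BertConfig
+
+    # download a pretrained configuration from the hub (cached locally)
+    config = BertConfig.from_pretrained("bert-base-uncased")
+
+    # save it to a local directory, then reload it from there
+    config.save_pretrained("./my-bert-config")
+    config = BertConfig.from_pretrained("./my-bert-config")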
diff --git a/docs/source/main_classes/logging.rst b/docs/source/main_classes/logging.rst
new file mode 100644
index 00000000000000..f382c992d089cb
--- /dev/null
+++ b/docs/source/main_classes/logging.rst
@@ -0,0 +1,58 @@
+Logging
+-----------------------------------------------------------------------------------------------------------------------
+
+🤗 Transformers has a centralized logging system, so that you can easily set the verbosity of the library.
+
+Currently the default verbosity of the library is ``WARNING``.
+
+To change the level of verbosity, just use one of the direct setters. For instance, here is how to change the verbosity
+to the INFO level.
+
+.. code-block:: python
+
+ import transformers
+ transformers.logging.set_verbosity_info()
+
+You can also use the environment variable ``TRANSFORMERS_VERBOSITY`` to override the default verbosity. You can set it
+to one of the following: ``debug``, ``info``, ``warning``, ``error``, ``critical``. For example:
+
+.. code-block:: bash
+
+ TRANSFORMERS_VERBOSITY=error ./myprogram.py
+
+All the methods of this logging module are documented below. The main ones are
+:func:`transformers.logging.get_verbosity`, to get the current level of verbosity in the logger, and
+:func:`transformers.logging.set_verbosity`, to set the verbosity to the level of your choice (a short example follows
+the list). In order (from the least verbose to the most verbose), those levels (with their corresponding int values in
+parentheses) are:
+
+- :obj:`transformers.logging.CRITICAL` or :obj:`transformers.logging.FATAL` (int value, 50): only report the most
+  critical errors.
+- :obj:`transformers.logging.ERROR` (int value, 40): only report errors.
+- :obj:`transformers.logging.WARNING` or :obj:`transformers.logging.WARN` (int value, 30): only report errors and
+  warnings. This is the default level used by the library.
+- :obj:`transformers.logging.INFO` (int value, 20): report errors, warnings and basic information.
+- :obj:`transformers.logging.DEBUG` (int value, 10): report all information.
+
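+Besides the direct setters documented below, you can pass one of those constants to
+:func:`transformers.logging.set_verbosity` and read the current level back with
+:func:`transformers.logging.get_verbosity`. A minimal sketch:
+
+.. code-block:: python
+
+    import transformers
+
+    transformers.logging.set_verbosity(transformers.logging.INFO)
+    print(transformers.logging.get_verbosity())  # 20, i.e. transformers.logging.INFO
+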
+Base setters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.logging.set_verbosity_error
+
+.. autofunction:: transformers.logging.set_verbosity_warning
+
+.. autofunction:: transformers.logging.set_verbosity_info
+
+.. autofunction:: transformers.logging.set_verbosity_debug
+
+Other functions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.logging.get_verbosity
+
+.. autofunction:: transformers.logging.set_verbosity
+
+.. autofunction:: transformers.logging.get_logger
+
+.. autofunction:: transformers.logging.enable_explicit_format
+
+.. autofunction:: transformers.logging.reset_format
diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst
index f6500438b110a5..668b10176f75b8 100644
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -1,5 +1,5 @@
Models
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
The base classes :class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` implement the
common methods for loading/saving a model either from a local file or directory, or from a pretrained model
@@ -17,39 +17,39 @@ for text generation, :class:`~transformers.generation_utils.GenerationMixin` (fo
:class:`~transformers.generation_tf_utils.TFGenerationMixin` (for the TensorFlow models)
-``PreTrainedModel``
-~~~~~~~~~~~~~~~~~~~~~
+PreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedModel
:members:
-``ModuleUtilsMixin``
-~~~~~~~~~~~~~~~~~~~~
+ModuleUtilsMixin
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_utils.ModuleUtilsMixin
:members:
-``TFPreTrainedModel``
-~~~~~~~~~~~~~~~~~~~~~
+TFPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFPreTrainedModel
:members:
-``TFModelUtilsMixin``
-~~~~~~~~~~~~~~~~~~~~~
+TFModelUtilsMixin
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_utils.TFModelUtilsMixin
:members:
-Generative models
-~~~~~~~~~~~~~~~~~
+Generation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.generation_utils.GenerationMixin
:members:
.. autoclass:: transformers.generation_tf_utils.TFGenerationMixin
- :members:
\ No newline at end of file
+ :members:
diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst
index 998100075eb057..149e745425ad2a 100644
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -1,5 +1,5 @@
Optimization
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
The ``.optimization`` module provides:
@@ -7,29 +7,29 @@ The ``.optimization`` module provides:
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
- a gradient accumulation class to accumulate the gradients of multiple batches
-``AdamW`` (PyTorch)
-~~~~~~~~~~~~~~~~~~~
+AdamW (PyTorch)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AdamW
:members:
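+
+As an illustrative sketch (the model and the step counts are placeholders), :class:`~transformers.AdamW` is typically
+paired with one of the schedules documented below:
+
+.. code-block:: python
+
+    import torch
+    from transformers import AdamW, get_linear_schedule_with_warmup
+
+    model = torch.nn.Linear(10, 2)  # stand-in for a transformer model
+    optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=100, num_training_steps=1000
+    )
+
+    for step in range(1000):
+        # ... forward pass and loss.backward() go here ...
+        optimizer.step()
+        scheduler.step()
+        optimizer.zero_grad()
+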
-``AdaFactor`` (PyTorch)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AdaFactor (PyTorch)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Adafactor
-``AdamWeightDecay`` (TensorFlow)
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AdamWeightDecay (TensorFlow)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AdamWeightDecay
.. autofunction:: transformers.create_optimizer
Schedules
-~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Learning Rate Schedules (Pytorch)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autofunction:: transformers.get_constant_schedule
@@ -62,16 +62,16 @@ Learning Rate Schedules (Pytorch)
:target: /imgs/warmup_linear_schedule.png
:alt:
-``Warmup`` (TensorFlow)
-^^^^^^^^^^^^^^^^^^^^^^^
+Warmup (TensorFlow)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.WarmUp
:members:
Gradient Strategies
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-``GradientAccumulator`` (TensorFlow)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+GradientAccumulator (TensorFlow)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.GradientAccumulator
diff --git a/docs/source/main_classes/output.rst b/docs/source/main_classes/output.rst
index fe43c8e59b1d4f..5ccd29209094d8 100644
--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -1,5 +1,5 @@
Model outputs
--------------
+-----------------------------------------------------------------------------------------------------------------------
PyTorch models have outputs that are instances of subclasses of :class:`~transformers.file_utils.ModelOutput`. Those
are data structures containing all the information returned by the model, but that can also be used as tuples or
@@ -44,98 +44,253 @@ values. Here for instance, it has two keys that are ``loss`` and ``logits``.
We document here the generic model outputs that are used by more than one model type. Specific output types are
documented on their corresponding model page.
-``ModelOutput``
-~~~~~~~~~~~~~~~
+ModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.ModelOutput
:members:
-``BaseModelOutput``
-~~~~~~~~~~~~~~~~~~~
+
+BaseModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutput
:members:
-``BaseModelOutputWithPooling``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+BaseModelOutputWithPooling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPooling
:members:
-``BaseModelOutputWithPast``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+BaseModelOutputWithCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithCrossAttentions
+ :members:
+
+
+BaseModelOutputWithPoolingAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
+ :members:
+
+
+BaseModelOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPast
:members:
-``Seq2SeqModelOutput``
-~~~~~~~~~~~~~~~~~~~~~~
+
+BaseModelOutputWithPastAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions
+ :members:
+
+
+Seq2SeqModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.Seq2SeqModelOutput
:members:
-``CausalLMOutput``
-~~~~~~~~~~~~~~~~~~
+
+CausalLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.CausalLMOutput
:members:
-``CausalLMOutputWithPast``
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+CausalLMOutputWithCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithCrossAttentions
+ :members:
+
+
+CausalLMOutputWithPastAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithPastAndCrossAttentions
+ :members:
+
+
+CausalLMOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithPast
:members:
-``MaskedLMOutput``
-~~~~~~~~~~~~~~~~~~
+
+MaskedLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.MaskedLMOutput
:members:
-``Seq2SeqLMOutput``
-~~~~~~~~~~~~~~~~~~~
+
+Seq2SeqLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.Seq2SeqLMOutput
:members:
-``NextSentencePredictorOutput``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+NextSentencePredictorOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.NextSentencePredictorOutput
:members:
-``SequenceClassifierOutput``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+SequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.SequenceClassifierOutput
:members:
-``Seq2SeqSequenceClassifierOutput``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Seq2SeqSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.Seq2SeqSequenceClassifierOutput
:members:
-``MultipleChoiceModelOutput``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+MultipleChoiceModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.MultipleChoiceModelOutput
:members:
-``TokenClassifierOutput``
-~~~~~~~~~~~~~~~~~~~~~~~~~
+
+TokenClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.TokenClassifierOutput
:members:
-``QuestionAnsweringModelOutput``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+QuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.QuestionAnsweringModelOutput
:members:
-``Seq2SeqQuestionAnsweringModelOutput``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Seq2SeqQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
:members:
+
+
+TFBaseModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutput
+ :members:
+
+
+TFBaseModelOutputWithPooling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPooling
+ :members:
+
+
+TFBaseModelOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPast
+ :members:
+
+
+TFSeq2SeqModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqModelOutput
+ :members:
+
+
+TFCausalLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFCausalLMOutput
+ :members:
+
+
+TFCausalLMOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFCausalLMOutputWithPast
+ :members:
+
+
+TFMaskedLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFMaskedLMOutput
+ :members:
+
+
+TFSeq2SeqLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqLMOutput
+ :members:
+
+
+TFNextSentencePredictorOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFNextSentencePredictorOutput
+ :members:
+
+
+TFSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFSequenceClassifierOutput
+ :members:
+
+
+TFSeq2SeqSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput
+ :members:
+
+
+TFMultipleChoiceModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFMultipleChoiceModelOutput
+ :members:
+
+
+TFTokenClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFTokenClassifierOutput
+ :members:
+
+
+TFQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFQuestionAnsweringModelOutput
+ :members:
+
+
+TFSeq2SeqQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
+ :members:
diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst
index 6bcbd399e11649..e67c6e2e922d99 100644
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -1,8 +1,8 @@
Pipelines
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
-The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most
-of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
+The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most of
+the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the
:doc:`task summary <../task_summary>` for examples of use.
@@ -21,21 +21,22 @@ There are two categories of pipeline abstractions to be aware about:
- :class:`~transformers.TokenClassificationPipeline`
- :class:`~transformers.TranslationPipeline`
- :class:`~transformers.ZeroShotClassificationPipeline`
+ - :class:`~transformers.Text2TextGenerationPipeline`
The pipeline abstraction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any
-other pipeline but requires an additional argument which is the `task`.
+The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
+pipeline but requires an additional argument which is the `task`.
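+
+A minimal sketch of this abstraction (the input sentence and the printed scores are only illustrative):
+
+.. code-block:: python
+
+    from transformers import pipeline
+
+    # The `task` argument selects which pipeline class gets instantiated.
+    classifier = pipeline("sentiment-analysis")
+    print(classifier("This library makes NLP much easier!"))
+    # e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
+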
.. autofunction:: transformers.pipeline
The task specific pipelines
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ConversationalPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.Conversation
@@ -44,70 +45,76 @@ ConversationalPipeline
:members:
FeatureExtractionPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.FeatureExtractionPipeline
:special-members: __call__
:members:
FillMaskPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.FillMaskPipeline
:special-members: __call__
:members:
NerPipeline
-==========================================
+=======================================================================================================================
This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined below. Please refer to that
pipeline for documentation and usage examples.
QuestionAnsweringPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.QuestionAnsweringPipeline
:special-members: __call__
:members:
SummarizationPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.SummarizationPipeline
:special-members: __call__
:members:
TextClassificationPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.TextClassificationPipeline
:special-members: __call__
:members:
TextGenerationPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.TextGenerationPipeline
:special-members: __call__
:members:
+Text2TextGenerationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.Text2TextGenerationPipeline
+ :special-members: __call__
+ :members:
+
TokenClassificationPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.TokenClassificationPipeline
:special-members: __call__
:members:
ZeroShotClassificationPipeline
-==========================================
+=======================================================================================================================
.. autoclass:: transformers.ZeroShotClassificationPipeline
:special-members: __call__
:members:
-
Parent class: :obj:`Pipeline`
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Pipeline
:members:
diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst
index 0e318eff077822..4f852cd918d108 100644
--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -1,15 +1,15 @@
Processors
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
This library includes processors for several traditional tasks. These processors can be used to process a dataset into
examples that can be fed to a model.
Processors
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
All processors follow the same architecture which is that of the
-:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
-of :class:`~transformers.data.processors.utils.InputExample`. These
+:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list of
+:class:`~transformers.data.processors.utils.InputExample`. These
:class:`~transformers.data.processors.utils.InputExample` can be converted to
:class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
@@ -26,16 +26,18 @@ of :class:`~transformers.data.processors.utils.InputExample`. These
GLUE
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-`General Language Understanding Evaluation (GLUE) `__ is a benchmark that evaluates
-the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
-`GLUE: A multi-task benchmark and analysis platform for natural language understanding `__
+`General Language Understanding Evaluation (GLUE) `__ is a benchmark that evaluates the
+performance of models across a diverse set of existing NLU tasks. It was released together with the paper `GLUE: A
+multi-task benchmark and analysis platform for natural language understanding
+`__
-This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
-CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
+This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched), CoLA, SST2, STSB,
+QQP, QNLI, RTE and WNLI.
Those processors are:
+
- :class:`~transformers.data.processors.utils.MrpcProcessor`
- :class:`~transformers.data.processors.utils.MnliProcessor`
- :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
@@ -46,51 +48,55 @@ Those processors are:
- :class:`~transformers.data.processors.utils.RteProcessor`
- :class:`~transformers.data.processors.utils.WnliProcessor`
-Additionally, the following method can be used to load values from a data file and convert them to a list of
+Additionally, the following method can be used to load values from a data file and convert them to a list of
:class:`~transformers.data.processors.utils.InputExample`.
.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
Example usage
-^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-An example using these processors is given in the `run_glue.py `__ script.
+An example using these processors is given in the `run_glue.py
+`__ script.
XNLI
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-`The Cross-Lingual NLI Corpus (XNLI) `__ is a benchmark that evaluates
-the quality of cross-lingual text representations.
-XNLI is crowd-sourced dataset based on `MultiNLI `: pairs of text are labeled with textual entailment
-annotations for 15 different languages (including both high-resource language such as English and low-resource languages such as Swahili).
+`The Cross-Lingual NLI Corpus (XNLI) `__ is a benchmark that evaluates the
+quality of cross-lingual text representations. XNLI is a crowd-sourced dataset based on `MultiNLI
+`: pairs of text are labeled with textual entailment annotations for 15
+different languages (including both high-resource languages such as English and low-resource languages such as Swahili).
-It was released together with the paper
-`XNLI: Evaluating Cross-lingual Sentence Representations `__
+It was released together with the paper `XNLI: Evaluating Cross-lingual Sentence Representations
+`__
This library hosts the processor to load the XNLI data:
+
- :class:`~transformers.data.processors.utils.XnliProcessor`
Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
-An example using these processors is given in the
-`run_xnli.py `__ script.
+An example using these processors is given in the `run_xnli.py
+`__ script.
SQuAD
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-`The Stanford Question Answering Dataset (SQuAD) `__ is a benchmark that evaluates
-the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
-`SQuAD: 100,000+ Questions for Machine Comprehension of Text `__. The second version (v2.0) was released alongside
-the paper `Know What You Don't Know: Unanswerable Questions for SQuAD `__.
+`The Stanford Question Answering Dataset (SQuAD) `__ is a benchmark that
+evaluates the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version
+(v1.1) was released together with the paper `SQuAD: 100,000+ Questions for Machine Comprehension of Text
+`__. The second version (v2.0) was released alongside the paper `Know What You Don't
+Know: Unanswerable Questions for SQuAD `__.
This library hosts a processor for each of the two versions:
Processors
-^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Those processors are:
+
- :class:`~transformers.data.processors.utils.SquadV1Processor`
- :class:`~transformers.data.processors.utils.SquadV2Processor`
@@ -99,20 +105,21 @@ They both inherit from the abstract class :class:`~transformers.data.processors.
.. autoclass:: transformers.data.processors.squad.SquadProcessor
:members:
-Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures`
-that can be used as model inputs.
+Additionally, the following method can be used to convert SQuAD examples into
+:class:`~transformers.data.processors.utils.SquadFeatures` that can be used as model inputs.
.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
-These processors as well as the aforementionned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
-Examples are given below.
+These processors as well as the aforementioned method can be used with files containing the data as well as with the
+`tensorflow_datasets` package. Examples are given below.
Example usage
-^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
Here is an example using the processors as well as the conversion method using data files:
-Example::
+.. code-block::
# Loading a V2 processor
processor = SquadV2Processor()
@@ -133,7 +140,7 @@ Example::
Using `tensorflow_datasets` is as easy as using a data file:
-Example::
+.. code-block::
# tensorflow_datasets only handle Squad V1.
tfds_examples = tfds.load("squad")
@@ -149,5 +156,5 @@ Example::
)
-Another example using these processors is given in the
-`run_squad.py `__ script.
+Another example using these processors is given in the `run_squad.py
+`__ script.
diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst
index a15e516df3e3c8..ed458c6cf2b994 100644
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,5 +1,5 @@
Tokenizer
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
A tokenizer is in charge of preparing the inputs for a model. The library contains tokenizers for all the models. Most
of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the
@@ -29,31 +29,32 @@ methods for using all the tokenizers:
:class:`~transformers.BatchEncoding` holds the output of the tokenizer's encoding methods (``__call__``,
``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
-tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by these
-methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by HuggingFace
-`tokenizers library `__), this class provides in addition several advanced
-alignment methods which can be used to map between the original string (character and words) and the token space (e.g.,
-getting the index of the token comprising a given character or the span of characters corresponding to a given token).
+tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
+these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by
+HuggingFace `tokenizers library `__), this class provides in addition
+several advanced alignment methods which can be used to map between the original string (character and words) and the
+token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
+to a given token).
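+
+As an illustrative sketch of those alignment methods (assuming a checkpoint with a "Fast" tokenizer is available):
+
+.. code-block:: python
+
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
+    encoding = tokenizer("Hello world!")
+
+    print(encoding.tokens())          # ['[CLS]', 'Hello', 'world', '!', '[SEP]']
+    print(encoding.char_to_token(6))  # index of the token covering the character "w"
+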
-``PreTrainedTokenizer``
-~~~~~~~~~~~~~~~~~~~~~~~~
+PreTrainedTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedTokenizer
:special-members: __call__
:members:
-``PreTrainedTokenizerFast``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+PreTrainedTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedTokenizerFast
:special-members: __call__
:members:
-``BatchEncoding``
-~~~~~~~~~~~~~~~~~~~~~~~~
+BatchEncoding
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BatchEncoding
:members:
diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index 55b308a74e0788..12fff9d518a007 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -1,62 +1,72 @@
-Trainer
-----------
-
-The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete
-training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`.
-
-Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a
-:class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of
-customization during training.
-
-The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
-`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
-
-Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop supporting the
-previous features. To inject custom behavior you can subclass them and override the following methods:
-
-- **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
-- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaulation DataLoader (PyTorch) or TF Dataset.
-- **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
-- **log** -- Logs information on the various objects watching training.
-- **setup_wandb** -- Setups wandb (see `here `__ for more information).
-- **create_optimizer_and_scheduler** -- Setups the optimizer and learning rate scheduler if they were not passed at
- init.
-- **training_step** -- Performs a training step.
-- **prediction_step** -- Performs an evaluation/test step.
-- **run_model** (TensorFlow only) -- Basic pass through the model.
-- **evaluate** -- Runs an evaluation loop and returns metrics.
-- **predict** -- Returns predictions (with metrics if labels are available) on a test set.
-
-
-``Trainer``
-~~~~~~~~~~~
-
-.. autoclass:: transformers.Trainer
- :members:
-
-``TFTrainer``
-~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFTrainer
- :members:
-
-``TrainingArguments``
-~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TrainingArguments
- :members:
-
-``TFTrainingArguments``
-~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TFTrainingArguments
- :members:
-
-Utilities
-~~~~~~~~~
-
-.. autoclass:: transformers.EvalPrediction
-
-.. autofunction:: transformers.set_seed
-
-.. autofunction:: transformers.torch_distributed_zero_first
+Trainer
+-----------------------------------------------------------------------------------------------------------------------
+
+The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete
+training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`.
+
+Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a
+:class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of
+customization during training.
+
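+Here is a minimal sketch of that wiring (``model``, ``train_dataset`` and ``eval_dataset`` are assumed to be defined
+elsewhere, and the argument values are only placeholders):
+
+.. code-block:: python
+
+    from transformers import Trainer, TrainingArguments
+
+    training_args = TrainingArguments(
+        output_dir="./results",              # where checkpoints will be written
+        num_train_epochs=3,
+        per_device_train_batch_size=16,
+        logging_steps=50,
+    )
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+    )
+    trainer.train()
+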
+The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
+`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
+
+Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop supporting the
+previous features. To inject custom behavior you can subclass them and override the following methods:
+
+- **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
+- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaluation DataLoader (PyTorch) or TF Dataset.
+- **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
+- **log** -- Logs information on the various objects watching training.
+- **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at
+  init.
+- **compute_loss** -- Computes the loss on a batch of training inputs.
+- **training_step** -- Performs a training step.
+- **prediction_step** -- Performs an evaluation/test step.
+- **run_model** (TensorFlow only) -- Basic pass through the model.
+- **evaluate** -- Runs an evaluation loop and returns metrics.
+- **predict** -- Returns predictions (with metrics if labels are available) on a test set.
+
+Here is an example of how to customize :class:`~transformers.Trainer` using a custom loss function:
+
+.. code-block:: python
+
+    from transformers import Trainer
+
+    class MyTrainer(Trainer):
+        def compute_loss(self, model, inputs):
+            labels = inputs.pop("labels")
+            outputs = model(**inputs)
+            logits = outputs[0]
+            # my_custom_loss can be any loss function of the logits and labels, defined elsewhere
+            return my_custom_loss(logits, labels)
+
+Another way to customize the training loop behavior for the PyTorch :class:`~transformers.Trainer` is to use
+:doc:`callbacks ` that can inspect the training loop state (for progress reporting, logging on TensorBoard or
+other ML platforms...) and make decisions (like early stopping).
+
+
+Trainer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Trainer
+ :members:
+
+
+TFTrainer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTrainer
+ :members:
+
+
+TrainingArguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainingArguments
+ :members:
+
+
+TFTrainingArguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTrainingArguments
+ :members:
diff --git a/docs/source/migration.md b/docs/source/migration.md
index 0cf53e1feaef2c..f3b1b55b54490d 100644
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -20,7 +20,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt
The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
@@ -109,7 +109,7 @@ for batch in train_data:
loss.backward()
optimizer.step()
-### In 🤗 Transformers, optimizer and schedules are splitted and instantiated like this:
+### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False) # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps) # PyTorch scheduler
### and used like this:
diff --git a/docs/source/model_doc/albert.rst b/docs/source/model_doc/albert.rst
index c78426d0c773ba..1a2165ae25bd3c 100644
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -1,15 +1,16 @@
ALBERT
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations `_
-by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. It presents
-two parameter-reduction techniques to lower memory consumption and increase the training speed of BERT:
+The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations
+`__ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma,
+Radu Soricut. It presents two parameter-reduction techniques to lower memory consumption and increase the training
+speed of BERT:
-- Splitting the embedding matrix into two smaller matrices
-- Using repeating layers split among groups
+- Splitting the embedding matrix into two smaller matrices.
+- Using repeating layers split among groups.
The abstract from the paper is the following:
@@ -18,29 +19,29 @@ downstream tasks. However, at some point further model increases become harder d
longer training times, and unexpected model degradation. To address these problems, we present two parameter-reduction
techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows
that our proposed methods lead to models that scale much better compared to the original BERT. We also use a
-self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream
-tasks with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE,
-RACE, and SQuAD benchmarks while having fewer parameters compared to BERT-large.*
+self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream tasks
+with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE, RACE, and
+SQuAD benchmarks while having fewer parameters compared to BERT-large.*
Tips:
-- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
- the right rather than the left.
+- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
+ than the left.
- ALBERT uses repeating layers which results in a small memory footprint, however the computational cost remains
similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
number of (repeating) layers.
-The original code can be found `here `_.
+The original code can be found `here `__.
AlbertConfig
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig
:members:
AlbertTokenizer
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
@@ -48,108 +49,108 @@ AlbertTokenizer
Albert specific outputs
-~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: transformers.modeling_albert.AlbertForPreTrainingOutput
+.. autoclass:: transformers.models.albert.modeling_albert.AlbertForPreTrainingOutput
:members:
-.. autoclass:: transformers.modeling_tf_albert.TFAlbertForPreTrainingOutput
+.. autoclass:: transformers.models.albert.modeling_tf_albert.TFAlbertForPreTrainingOutput
:members:
AlbertModel
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertModel
- :members:
+ :members: forward
AlbertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForPreTraining
- :members:
+ :members: forward
AlbertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForMaskedLM
- :members:
+ :members: forward
AlbertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForSequenceClassification
- :members:
+ :members: forward
AlbertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForMultipleChoice
:members:
AlbertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForTokenClassification
- :members:
+ :members: forward
AlbertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForQuestionAnswering
- :members:
+ :members: forward
TFAlbertModel
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertModel
- :members:
+ :members: call
TFAlbertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForPreTraining
- :members:
+ :members: call
TFAlbertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForMaskedLM
- :members:
+ :members: call
TFAlbertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForSequenceClassification
- :members:
+ :members: call
TFAlbertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForMultipleChoice
- :members:
+ :members: call
TFAlbertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForTokenClassification
- :members:
+ :members: call
TFAlbertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForQuestionAnswering
- :members:
+ :members: call
diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst
index c3345ac4f4cc7d..d4a81f0c84d59f 100644
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -1,109 +1,165 @@
-AutoModels
------------
+Auto Classes
+-----------------------------------------------------------------------------------------------------------------------
In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you
-are supplying to the ``from_pretrained`` method.
+are supplying to the :obj:`from_pretrained()` method. AutoClasses are here to do this job for you so that you
+automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary.
-AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path
-to the pretrained weights/config/vocabulary:
+Instantiating one of :class:`~transformers.AutoConfig`, :class:`~transformers.AutoModel`, and
+:class:`~transformers.AutoTokenizer` will directly create a class of the relevant architecture. For instance
-Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant
-architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of
-:class:`~transformers.BertModel`).
+.. code-block:: python
-``AutoConfig``
-~~~~~~~~~~~~~~~~~~~~~
+ model = AutoModel.from_pretrained('bert-base-cased')
+
+will create a model that is an instance of :class:`~transformers.BertModel`.
+
+There is one class of :obj:`AutoModel` for each task, and for each backend (PyTorch or TensorFlow).
+
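+For instance, here is a sketch for a sequence classification task (the checkpoint name is just an example):
+
+.. code-block:: python
+
+    from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
+    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
+
+    inputs = tokenizer("AutoClasses pick the right architecture for you.", return_tensors="pt")
+    outputs = model(**inputs)
+    logits = outputs[0]  # shape (1, num_labels)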
+
+AutoConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoConfig
:members:
-``AutoTokenizer``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AutoTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoTokenizer
:members:
-``AutoModel``
-~~~~~~~~~~~~~~~~~~~~~
+AutoModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModel
:members:
-``AutoModelForPreTraining``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AutoModelForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForPreTraining
:members:
-``AutoModelWithLMHead``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AutoModelForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForCausalLM
+ :members:
+
+
+AutoModelForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: transformers.AutoModelWithLMHead
+.. autoclass:: transformers.AutoModelForMaskedLM
:members:
-``AutoModelForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AutoModelForSeq2SeqLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForSeq2SeqLM
+ :members:
+
+
+AutoModelForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForSequenceClassification
:members:
-``AutoModelForQuestionAnswering``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AutoModelForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: transformers.AutoModelForQuestionAnswering
+.. autoclass:: transformers.AutoModelForMultipleChoice
:members:
-``AutoModelForTokenClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AutoModelForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForNextSentencePrediction
+ :members:
+
+
+AutoModelForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForTokenClassification
:members:
-``TFAutoModel``
-~~~~~~~~~~~~~~~~~~~~~
+
+AutoModelForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForQuestionAnswering
+ :members:
+
+
+TFAutoModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModel
:members:
-``TFAutoModelForPreTraining``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TFAutoModelForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForPreTraining
:members:
-``TFAutoModelWithLMHead``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TFAutoModelForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForCausalLM
+ :members:
+
+
+TFAutoModelForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForMaskedLM
+ :members:
+
+
+TFAutoModelForSeq2SeqLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: transformers.TFAutoModelWithLMHead
+.. autoclass:: transformers.TFAutoModelForSeq2SeqLM
:members:
-``TFAutoModelForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TFAutoModelForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForSequenceClassification
:members:
-``TFAutoModelForQuestionAnswering``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TFAutoModelForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: transformers.TFAutoModelForQuestionAnswering
+.. autoclass:: transformers.TFAutoModelForMultipleChoice
:members:
-``TFAutoModelForTokenClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+TFAutoModelForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForTokenClassification
:members:
+
+
+TFAutoModelForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForQuestionAnswering
+ :members:
diff --git a/docs/source/model_doc/bart.rst b/docs/source/model_doc/bart.rst
index 69a502cf01217d..f2a111086fbba6 100644
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -1,49 +1,86 @@
-Bart
-----------------------------------------------------
-**DISCLAIMER:** If you see something strange,
-file a `Github Issue `__ and assign
-@sshleifer
+BART
+-----------------------------------------------------------------------------------------------------------------------
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+`__ and assign
+@patrickvonplaten
Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Bart model was proposed in `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation,
+Translation, and Comprehension `__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan
+Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
-The Bart model was `proposed `_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
According to the abstract,
-- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT).
-- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, where spans of text are replaced with a single mask token.
-- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE.
+- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a
+ left-to-right decoder (like GPT).
+- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme,
+ where spans of text are replaced with a single mask token.
+- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It
+ matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new
+ state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains
+ of up to 6 ROUGE.
+
+The Authors' code can be found `here `__.
-The Authors' code can be found `here `_
+
+Examples
+_______________________________________________________________________________________________________________________
+
+- Examples and scripts for fine-tuning BART and other models for sequence to sequence tasks can be found in
+ `examples/seq2seq/ `__.
+- An example of how to train :class:`~transformers.BartForConditionalGeneration` with a Hugging Face :obj:`datasets`
+ object can be found in this `forum discussion
+ `__.
+- `Distilled checkpoints `__ are described in this `paper
+ `__.
Implementation Notes
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-- Bart doesn't use :obj:`token_type_ids` for sequence classification. Use BartTokenizer.encode to get the proper splitting.
-- The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``) if they are not passed. This is different than some other modeling APIs.
-- Model predictions are intended to be identical to the original implementation. This only works, however, if the string you pass to ``fairseq.encode`` starts with a space.
-- ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization, see the example in that docstrings
-- Models that load the ``"facebook/bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.
-- for training/forward passes that don't involve beam search, pass ``use_cache=False``
+- Bart doesn't use :obj:`token_type_ids` for sequence classification. Use :class:`~transformers.BartTokenizer` or
+ :meth:`~transformers.BartTokenizer.encode` to get the proper splitting.
+- The forward pass of :class:`~transformers.BartModel` will create decoder inputs (using the helper function
+ :func:`transformers.models.bart.modeling_bart._prepare_bart_decoder_inputs`) if they are not passed. This is
+ different than some other modeling APIs.
+- Model predictions are intended to be identical to the original implementation when
+ :obj:`force_bos_token_to_be_generated=True`. This only works, however, if the string you pass to
+ :func:`fairseq.encode` starts with a space.
+- :meth:`~transformers.BartForConditionalGeneration.generate` should be used for conditional generation tasks like
+  summarization; see the example in its docstring and the short sketch after this list.
+- Models that load the `facebook/bart-large-cnn` weights will not have a :obj:`mask_token_id`, or be able to perform
+ mask-filling tasks.
+- For training/forward passes that don't involve beam search, pass :obj:`use_cache=False`.
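+
+For instance, a minimal summarization sketch (assuming the :obj:`facebook/bart-large-cnn` checkpoint) could look like
+this:
+
+.. code-block::
+
+ from transformers import BartForConditionalGeneration, BartTokenizer
+
+ # load a checkpoint fine-tuned for summarization
+ tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
+ model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+
+ ARTICLE = "PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions."
+ inputs = tokenizer([ARTICLE], max_length=1024, truncation=True, return_tensors="pt")
+
+ # beam search generation; use_cache=False is only needed for plain training/forward passes
+ summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=60, early_stopping=True)
+ print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
+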
+Mask Filling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-BartForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+The :obj:`facebook/bart-base` and :obj:`facebook/bart-large` checkpoints can be used to fill multi-token masks.
+
+.. code-block::
+
+ from transformers import BartForConditionalGeneration, BartTokenizer
+ model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", force_bos_token_to_be_generated=True)
+ tok = BartTokenizer.from_pretrained("facebook/bart-large")
+ example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
+ batch = tok(example_english_phrase, return_tensors='pt')
+ generated_ids = model.generate(batch['input_ids'])
+ assert tok.batch_decode(generated_ids, skip_special_tokens=True) == ['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']
-.. autoclass:: transformers.BartForConditionalGeneration
- :members: forward
BartConfig
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartConfig
:members:
BartTokenizer
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartTokenizer
:members:
@@ -51,25 +88,45 @@ BartTokenizer
BartModel
-~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartModel
:members: forward
-.. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs
+.. autofunction:: transformers.models.bart.modeling_bart._prepare_bart_decoder_inputs
+
+
+BartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForConditionalGeneration
+ :members: forward
BartForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartForSequenceClassification
:members: forward
BartForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartForQuestionAnswering
:members: forward
+
+TFBartModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBartModel
+ :members: call
+
+
+TFBartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBartForConditionalGeneration
+ :members: call
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index 13bc47e260d6df..589f6277f8ee81 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -1,13 +1,13 @@
BERT
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding `__
-by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
-pre-trained using a combination of masked language modeling objective and next sentence prediction
-on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
+`__ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a
+bidirectional transformer pretrained using a combination of masked language modeling objective and next sentence
+prediction on a large corpus comprising the Toronto Book Corpus and Wikipedia.
The abstract from the paper is the following:
@@ -25,22 +25,22 @@ improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).*
Tips:
-- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
- the right rather than the left.
-- BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is efficient at predicting masked
- tokens and at NLU in general, but is not optimal for text generation.
+- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
+ the left.
+- BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is
+  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation; see the
+  mask-filling sketch below.
-The original code can be found `here `_.
+The original code can be found `here `__.
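+
+To illustrate the masked language modeling objective mentioned in the tips above, here is a minimal mask-filling
+sketch (the :obj:`bert-base-uncased` checkpoint and the :obj:`fill-mask` pipeline are assumed):
+
+.. code-block::
+
+ from transformers import pipeline
+
+ # BERT was pretrained with MLM, so it is well suited to predicting masked tokens
+ unmasker = pipeline("fill-mask", model="bert-base-uncased")
+ print(unmasker("Paris is the [MASK] of France."))
+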
BertConfig
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertConfig
:members:
BertTokenizer
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
@@ -48,144 +48,150 @@ BertTokenizer
BertTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertTokenizerFast
:members:
Bert specific outputs
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-.. autoclass:: transformers.modeling_bert.BertForPreTrainingOutput
+.. autoclass:: transformers.models.bert.modeling_bert.BertForPreTrainingOutput
:members:
-.. autoclass:: transformers.modeling_tf_bert.TFBertForPreTrainingOutput
+.. autoclass:: transformers.models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
:members:
BertModel
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertModel
- :members:
+ :members: forward
BertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForPreTraining
- :members:
+ :members: forward
BertModelLMHeadModel
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertLMHeadModel
- :members:
+ :members: forward
BertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForMaskedLM
- :members:
+ :members: forward
BertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForNextSentencePrediction
- :members:
+ :members: forward
BertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForSequenceClassification
- :members:
+ :members: forward
BertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForMultipleChoice
- :members:
+ :members: forward
BertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForTokenClassification
- :members:
+ :members: forward
BertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForQuestionAnswering
- :members:
+ :members: forward
TFBertModel
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertModel
- :members:
+ :members: call
TFBertForPreTraining
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForPreTraining
- :members:
+ :members: call
TFBertModelLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertLMHeadModel
- :members:
+ :members: call
TFBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForMaskedLM
- :members:
+ :members: call
TFBertForNextSentencePrediction
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForNextSentencePrediction
- :members:
+ :members: call
TFBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForSequenceClassification
- :members:
+ :members: call
TFBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForMultipleChoice
- :members:
+ :members: call
TFBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForTokenClassification
- :members:
+ :members: call
TFBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForQuestionAnswering
- :members:
+ :members: call
+
+
+FlaxBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: transformers.FlaxBertModel
+ :members: __call__
diff --git a/docs/source/model_doc/bertgeneration.rst b/docs/source/model_doc/bertgeneration.rst
new file mode 100644
index 00000000000000..9ea904c590e2ba
--- /dev/null
+++ b/docs/source/model_doc/bertgeneration.rst
@@ -0,0 +1,96 @@
+BertGeneration
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BertGeneration model is a BERT model that can be leveraged for sequence-to-sequence tasks using
+:class:`~transformers.EncoderDecoderModel` as proposed in `Leveraging Pre-trained Checkpoints for Sequence Generation
+Tasks `__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+
+The abstract from the paper is the following:
+
+*Unsupervised pre-training of large neural models has recently revolutionized Natural Language Processing. By
+warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple
+benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language
+Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We
+developed a Transformer-based sequence-to-sequence model that is compatible with publicly available pre-trained BERT,
+GPT-2 and RoBERTa checkpoints and conducted an extensive empirical study on the utility of initializing our model, both
+encoder and decoder, with these checkpoints. Our models result in new state-of-the-art results on Machine Translation,
+Text Summarization, Sentence Splitting, and Sentence Fusion.*
+
+Usage:
+
+- The model can be used in combination with the :class:`~transformers.EncoderDecoderModel` to leverage two pretrained
+ BERT checkpoints for subsequent fine-tuning.
+
+.. code-block::
+
+ # leverage checkpoints for Bert2Bert model...
+ # use BERT's cls token as BOS token and sep token as EOS token
+ encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
+ # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
+ decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
+ bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
+
+ # create tokenizer...
+ tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
+
+ input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
+ labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
+
+ # train...
+ loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
+ loss.backward()
+
+
+- Pretrained :class:`~transformers.EncoderDecoderModel` checkpoints are also directly available in the model hub, e.g.:
+
+
+.. code-block::
+
+ # instantiate sentence fusion model
+ sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
+ tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
+
+ input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
+
+ outputs = sentence_fuser.generate(input_ids)
+
+ print(tokenizer.decode(outputs[0]))
+
+
+Tips:
+
+- :class:`~transformers.BertGenerationEncoder` and :class:`~transformers.BertGenerationDecoder` should be used in
+  combination with :class:`~transformers.EncoderDecoderModel`.
+- For summarization, sentence splitting, sentence fusion and translation, no special tokens are required for the input.
+ Therefore, no EOS token should be added to the end of the input.
+
+The original code can be found `here `__.
+
+BertGenerationConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertGenerationConfig
+ :members:
+
+
+BertGenerationTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertGenerationTokenizer
+ :members: save_vocabulary
+
+BertGenerationEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertGenerationEncoder
+ :members: forward
+
+
+BertGenerationDecoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertGenerationDecoder
+ :members: forward
diff --git a/docs/source/model_doc/blenderbot.rst b/docs/source/model_doc/blenderbot.rst
new file mode 100644
index 00000000000000..4d79144e8e443e
--- /dev/null
+++ b/docs/source/model_doc/blenderbot.rst
@@ -0,0 +1,106 @@
+Blenderbot
+-----------------------------------------------------------------------------------------------------------------------
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+`__ .
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Blender chatbot model was proposed in `Recipes for building an open-domain chatbot
+`__ Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu,
+Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston on 30 Apr 2020.
+
+The abstract of the paper is the following:
+
+*Building open-domain chatbots is a challenging area for machine learning research. While prior work has shown that
+scaling neural models in the number of parameters and the size of the data they are trained on gives improved results,
+we show that other ingredients are important for a high-performing chatbot. Good conversation requires a number of
+skills that an expert conversationalist blends in a seamless way: providing engaging talking points and listening to
+their partners, and displaying knowledge, empathy and personality appropriately, while maintaining a consistent
+persona. We show that large scale models can learn these skills when given appropriate training data and choice of
+generation strategy. We build variants of these recipes with 90M, 2.7B and 9.4B parameter models, and make our models
+and code publicly available. Human evaluations show our best models are superior to existing approaches in multi-turn
+dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing
+failure cases of our models.*
+
+The authors' code can be found `here `__ .
+
+
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Blenderbot uses a standard `seq2seq model transformer `__ based architecture.
+- It inherits completely from :class:`~transformers.BartForConditionalGeneration`.
+- Even though Blenderbot is one model, it uses two tokenizers: :class:`~transformers.BlenderbotSmallTokenizer` for the
+  90M checkpoint and :class:`~transformers.BlenderbotTokenizer` for all other checkpoints.
+- :class:`~transformers.BlenderbotSmallTokenizer` will always return :class:`~transformers.BlenderbotSmallTokenizer`,
+ regardless of checkpoint. To use the 3B parameter checkpoint, you must call
+ :class:`~transformers.BlenderbotTokenizer` directly.
+- Available checkpoints can be found in the `model hub `__.
+
+
+Usage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here is an example of model usage:
+
+.. code-block::
+
+ >>> from transformers import BlenderbotSmallTokenizer, BlenderbotForConditionalGeneration
+ >>> mname = 'facebook/blenderbot-90M'
+ >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
+ >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)
+ >>> UTTERANCE = "My friends are cool but they eat too many carbs."
+ >>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
+ >>> reply_ids = model.generate(**inputs)
+ >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in reply_ids])
+
+
+Here is how you can check out config values:
+
+.. code-block::
+
+
+ >>> from transformers import BlenderbotConfig
+ >>> config_90 = BlenderbotConfig.from_pretrained("facebook/blenderbot-90M")
+ >>> config_90.to_diff_dict() # show interesting Values.
+ >>> configuration_3B = BlenderbotConfig("facebook/blenderbot-3B")
+ >>> configuration_3B.to_diff_dict()
+
+
+BlenderbotConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotConfig
+ :members:
+
+BlenderbotTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotTokenizer
+ :members: build_inputs_with_special_tokens
+
+BlenderbotSmallTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallTokenizer
+ :members:
+
+
+BlenderbotForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See :obj:`transformers.BartForConditionalGeneration` for arguments to `forward` and `generate`.
+
+.. autoclass:: transformers.BlenderbotForConditionalGeneration
+ :members:
+
+
+TFBlenderbotForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See :obj:`transformers.TFBartForConditionalGeneration` for arguments to `forward` and `generate`.
+
+.. autoclass:: transformers.TFBlenderbotForConditionalGeneration
+ :members:
diff --git a/docs/source/model_doc/camembert.rst b/docs/source/model_doc/camembert.rst
index 8f0d578848244b..c3a022c87811c2 100644
--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -1,41 +1,41 @@
CamemBERT
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model `__
-by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
+The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model `__ by
+Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
trained on 138GB of French text.
The abstract from the paper is the following:
-*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success,
-most available models have either been trained on English data or on the concatenation of data in multiple
-languages. This makes practical use of such models --in all languages except English-- very limited. Aiming
-to address this issue for French, we release CamemBERT, a French version of the Bi-directional Encoders for
-Transformers (BERT). We measure the performance of CamemBERT compared to multilingual models in multiple
-downstream tasks, namely part-of-speech tagging, dependency parsing, named-entity recognition, and natural
-language inference. CamemBERT improves the state of the art for most of the tasks considered. We release the
-pretrained model for CamemBERT hoping to foster research and downstream applications for French NLP.*
+*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success, most available
+models have either been trained on English data or on the concatenation of data in multiple languages. This makes
+practical use of such models --in all languages except English-- very limited. Aiming to address this issue for French,
+we release CamemBERT, a French version of the Bi-directional Encoders for Transformers (BERT). We measure the
+performance of CamemBERT compared to multilingual models in multiple downstream tasks, namely part-of-speech tagging,
+dependency parsing, named-entity recognition, and natural language inference. CamemBERT improves the state of the art
+for most of the tasks considered. We release the pretrained model for CamemBERT hoping to foster research and
+downstream applications for French NLP.*
Tips:
-- This implementation is the same as RoBERTa. Refer to the `documentation of RoBERTa <./roberta.html>`__ for usage
- examples as well as the information relative to the inputs and outputs.
+- This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa ` for usage examples
+ as well as the information relative to the inputs and outputs.
-The original code can be found `here `_.
+The original code can be found `here `__.
CamembertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertConfig
:members:
CamembertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
@@ -43,91 +43,91 @@ CamembertTokenizer
CamembertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertModel
:members:
CamembertForCausalLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForCausalLM
:members:
CamembertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForMaskedLM
:members:
CamembertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForSequenceClassification
:members:
CamembertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForMultipleChoice
:members:
CamembertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForTokenClassification
:members:
CamembertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForQuestionAnswering
:members:
TFCamembertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertModel
:members:
TFCamembertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForMaskedLM
:members:
TFCamembertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForSequenceClassification
:members:
TFCamembertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForMultipleChoice
:members:
TFCamembertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForTokenClassification
:members:
TFCamembertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForQuestionAnswering
- :members:
\ No newline at end of file
+ :members:
diff --git a/docs/source/model_doc/ctrl.rst b/docs/source/model_doc/ctrl.rst
index 2683320eb35e72..86bf6dea78bb83 100644
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -1,80 +1,80 @@
CTRL
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation `_
-by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
-corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
+CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation
+`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and
+Richard Socher. It's a causal (unidirectional) transformer pre-trained using language modeling on a very large corpus
+of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
The abstract from the paper is the following:
*Large-scale language models show promising text generation capabilities, but users cannot easily control particular
aspects of the generated text. We release CTRL, a 1.63 billion-parameter conditional transformer language model,
trained to condition on control codes that govern style, content, and task-specific behavior. Control codes were
-derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning
-while providing more explicit control over text generation. These codes also allow CTRL to predict which parts of
-the training data are most likely given a sequence. This provides a potential method for analyzing large amounts
-of data via model-based source attribution.*
+derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning while
+providing more explicit control over text generation. These codes also allow CTRL to predict which parts of the
+training data are most likely given a sequence. This provides a potential method for analyzing large amounts of data
+via model-based source attribution.*
Tips:
- CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences
- or links to generate coherent text. Refer to the `original implementation `__
- for more information.
-- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
- the right rather than the left.
+ or links to generate coherent text. Refer to the `original implementation `__ for
+ more information.
+- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
+ the left.
- CTRL was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
- token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text as
- it can be observed in the `run_generation.py` example script.
+ token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text as it can be
+ observed in the `run_generation.py` example script.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
- this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
- See `reusing the past in generative models <../quickstart.html#using-the-past>`_ for more information on the usage
- of this argument.
+ this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
+ `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
+ this argument.
-The original code can be found `here `_.
+The original code can be found `here `__.
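+
+For instance, a minimal generation sketch that starts from a control code (the :obj:`ctrl` checkpoint and the
+``Links`` control code are assumed) could look like this:
+
+.. code-block::
+
+ from transformers import CTRLTokenizer, CTRLLMHeadModel
+
+ tokenizer = CTRLTokenizer.from_pretrained("ctrl")
+ model = CTRLLMHeadModel.from_pretrained("ctrl")
+
+ # generations should start with a control code such as "Links"
+ input_ids = tokenizer("Links Hello, my dog is cute", return_tensors="pt").input_ids
+ generated = model.generate(input_ids, max_length=50, repetition_penalty=1.2)
+ print(tokenizer.decode(generated[0]))
+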
CTRLConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLConfig
:members:
CTRLTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLTokenizer
:members: save_vocabulary
CTRLModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLModel
- :members:
+ :members: forward
CTRLLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLLMHeadModel
- :members:
+ :members: forward
TFCTRLModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCTRLModel
- :members:
+ :members: call
TFCTRLLMHeadModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCTRLLMHeadModel
- :members:
+ :members: call
diff --git a/docs/source/model_doc/deberta.rst b/docs/source/model_doc/deberta.rst
new file mode 100644
index 00000000000000..e54844f5ffa1c2
--- /dev/null
+++ b/docs/source/model_doc/deberta.rst
@@ -0,0 +1,65 @@
+DeBERTa
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
+`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It is based on Google's
+BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
+
+It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
+RoBERTa.
+
+The abstract from the paper is the following:
+
+*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
+language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
+disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
+disentangled attention mechanism, where each word is represented using two vectors that encode its content and
+position, respectively, and the attention weights among words are computed using disentangled matrices on their
+contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
+predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
+of model pre-training and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half
+of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
+(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
+pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
+
+
+The original code can be found `here `__.
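+
+A minimal usage sketch (assuming the :obj:`microsoft/deberta-base` checkpoint) might look like this:
+
+.. code-block::
+
+ import torch
+ from transformers import DebertaTokenizer, DebertaModel
+
+ tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
+ model = DebertaModel.from_pretrained("microsoft/deberta-base")
+
+ inputs = tokenizer("DeBERTa builds on RoBERTa with disentangled attention.", return_tensors="pt")
+ with torch.no_grad():
+     outputs = model(**inputs)
+
+ # first element of the output is the final hidden states: (batch_size, sequence_length, hidden_size)
+ print(outputs[0].shape)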
+
+
+DebertaConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaConfig
+ :members:
+
+
+DebertaTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaTokenizer
+ :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+ create_token_type_ids_from_sequences, save_vocabulary
+
+
+DebertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaModel
+ :members:
+
+
+DebertaPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaPreTrainedModel
+ :members:
+
+
+DebertaForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaForSequenceClassification
+ :members:
diff --git a/docs/source/model_doc/dialogpt.rst b/docs/source/model_doc/dialogpt.rst
index 4381698829bb8d..f310208968bfd0 100644
--- a/docs/source/model_doc/dialogpt.rst
+++ b/docs/source/model_doc/dialogpt.rst
@@ -1,39 +1,42 @@
DialoGPT
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-DialoGPT was proposed in
-`DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation `_
-by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-It's a GPT2 Model trained on 147M conversation-like exchanges extracted from Reddit.
+DialoGPT was proposed in `DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation
+`_ by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao,
+Jianfeng Gao, Jingjing Liu, Bill Dolan. It's a GPT2 Model trained on 147M conversation-like exchanges extracted from
+Reddit.
The abstract from the paper is the following:
-*We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained transformer).
-Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human both in terms of automatic and human evaluation in single-turn dialogue settings.
-We show that conversational systems that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline systems.
-The pre-trained model and training pipeline are publicly released to facilitate research into neural response generation and the development of more intelligent open-domain dialogue systems.*
+*We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained
+transformer). Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning
+from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human
+both in terms of automatic and human evaluation in single-turn dialogue settings. We show that conversational systems
+that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline
+systems. The pre-trained model and training pipeline are publicly released to facilitate research into neural response
+generation and the development of more intelligent open-domain dialogue systems.*
Tips:
-- DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
- the right rather than the left.
-- DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful at response generation in open-domain dialogue systems.
-- DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on `DialoGPT's model card `_.
+- DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
+ than the left.
+- DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful
+ at response generation in open-domain dialogue systems.
+- DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on `DialoGPT's model card
+ `_.
Training:
-In order to train or fine-tune DialoGPT, one can use causal language modeling training.
-To cite the official paper:
-*We follow the OpenAI GPT-2 to model a multiturn dialogue session
-as a long text and frame the generation task as language modeling. We first
-concatenate all dialog turns within a dialogue session into a long text
-x_1,..., x_N (N is the sequence length), ended by the end-of-text token.*
-For more information please confer to the original paper.
-
+In order to train or fine-tune DialoGPT, one can use causal language modeling training. To cite the official paper: *We
+follow the OpenAI GPT-2 to model a multiturn dialogue session as a long text and frame the generation task as language
+modeling. We first concatenate all dialog turns within a dialogue session into a long text x_1,..., x_N (N is the
+sequence length), ended by the end-of-text token.* For more information please refer to the original paper.
-DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring `_.
+
+DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring
+`_.
The original code can be found `here `_.
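+
+As an illustration of this causal language modeling usage, a multi-turn chat loop in the spirit of the model card
+(assuming the :obj:`microsoft/DialoGPT-medium` checkpoint) could look like this:
+
+.. code-block::
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
+ model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
+
+ chat_history_ids = None
+ for step in range(5):
+     # encode the new user input and append the end-of-text token
+     new_user_input_ids = tokenizer.encode(input(">> User: ") + tokenizer.eos_token, return_tensors="pt")
+     # append the new input to the chat history
+     if chat_history_ids is not None:
+         bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
+     else:
+         bot_input_ids = new_user_input_ids
+     # generate a response, capping the total length at 1000 tokens
+     chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
+     response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
+     print("DialoGPT: {}".format(response))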
diff --git a/docs/source/model_doc/distilbert.rst b/docs/source/model_doc/distilbert.rst
index 67f27495bd2197..7320d88573b305 100644
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -1,15 +1,15 @@
DistilBERT
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The DistilBERT model was proposed in the blog post
-`Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT `__,
-and the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter `__.
-DistilBERT is a small, fast, cheap and light Transformer model trained by distilling Bert base. It has 40% less
-parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of Bert's performances as measured on
-the GLUE language understanding benchmark.
+The DistilBERT model was proposed in the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a
+distilled version of BERT `__, and the paper `DistilBERT, a
+distilled version of BERT: smaller, faster, cheaper and lighter `__. DistilBERT is a
+small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than
+`bert-base-uncased`, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language
+understanding benchmark.
The abstract from the paper is the following:
@@ -17,123 +17,126 @@ The abstract from the paper is the following:
operating these large models in on-the-edge and/or under constrained computational training or inference budgets
remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
-counterparts. While most prior work investigated the use of distillation for building task-specific models, we
-leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a
-BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage
-the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language
-modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train
-and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative
-on-device study.*
+counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage
+knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by
+40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive
+biases learned by larger models during pre-training, we introduce a triple loss combining language modeling,
+distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we
+demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device
+study.*
Tips:
-- DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`)
-- DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option.
+- DistilBERT doesn't have :obj:`token_type_ids`; you don't need to indicate which token belongs to which segment. Just
+  separate your segments with the separation token :obj:`tokenizer.sep_token` (or :obj:`[SEP]`), as shown in the
+  sketch below.
+- DistilBERT doesn't have options to select the input positions (:obj:`position_ids` input). This could be added if
+ necessary though, just let us know if you need this option.
-The original code can be found `here `_.
+The original code can be found `here
+`__.
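+
+To illustrate the first tip, here is a short tokenization sketch (the :obj:`distilbert-base-uncased` checkpoint is
+assumed) showing that no :obj:`token_type_ids` are produced:
+
+.. code-block::
+
+ from transformers import DistilBertTokenizer
+
+ tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+ # sentence pairs are only separated by the [SEP] token; no token_type_ids are returned
+ encoded = tokenizer("How old are you?", "I'm 6 years old")
+ print(encoded.keys())  # dict_keys(['input_ids', 'attention_mask'])
+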
DistilBertConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertConfig
:members:
DistilBertTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertTokenizer
:members:
DistilBertTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertTokenizerFast
:members:
DistilBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertModel
- :members:
+ :members: forward
DistilBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForMaskedLM
- :members:
+ :members: forward
DistilBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForSequenceClassification
- :members:
+ :members: forward
DistilBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForMultipleChoice
- :members:
+ :members: forward
DistilBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForTokenClassification
- :members:
+ :members: forward
DistilBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForQuestionAnswering
- :members:
+ :members: forward
TFDistilBertModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertModel
- :members:
+ :members: call
TFDistilBertForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForMaskedLM
- :members:
+ :members: call
TFDistilBertForSequenceClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForSequenceClassification
- :members:
+ :members: call
TFDistilBertForMultipleChoice
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForMultipleChoice
- :members:
+ :members: call
TFDistilBertForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForTokenClassification
- :members:
+ :members: call
TFDistilBertForQuestionAnswering
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForQuestionAnswering
- :members:
+ :members: call
diff --git a/docs/source/model_doc/dpr.rst b/docs/source/model_doc/dpr.rst
index a77d3868bf435b..86a60ff15daaa1 100644
--- a/docs/source/model_doc/dpr.rst
+++ b/docs/source/model_doc/dpr.rst
@@ -1,13 +1,12 @@
DPR
-----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Dense Passage Retrieval (DPR) - is a set of tools and models for state-of-the-art open-domain Q&A research.
-It is based on the following paper:
-
-Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih, Dense Passage Retrieval for Open-Domain Question Answering.
+Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research. It was
+introduced in `Dense Passage Retrieval for Open-Domain Question Answering `__ by
+Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih.
The abstract from the paper is the following:
@@ -19,84 +18,103 @@ our dense retriever outperforms a strong Lucene-BM25 system largely by 9%-19% ab
retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA
benchmarks.*
-The original code can be found `here `_.
+The original code can be found `here