From 8f21968f40306ce3d7bd69cb085a91b35f033f57 Mon Sep 17 00:00:00 2001
From: loubnabnl
Date: Thu, 18 Aug 2022 09:08:06 +0000
Subject: [PATCH 01/10] add examples subfolder

---
 .../codeparrot/examples/README.md             |  16 ++
 .../codeparrot/examples/requirements.txt      |   5 +
 .../examples/train_complexity_predictor.py    | 154 ++++++++++++++++++
 3 files changed, 175 insertions(+)
 create mode 100644 examples/research_projects/codeparrot/examples/README.md
 create mode 100644 examples/research_projects/codeparrot/examples/requirements.txt
 create mode 100644 examples/research_projects/codeparrot/examples/train_complexity_predictor.py

diff --git a/examples/research_projects/codeparrot/examples/README.md b/examples/research_projects/codeparrot/examples/README.md
new file mode 100644
index 000000000000..7c8e59688618
--- /dev/null
+++ b/examples/research_projects/codeparrot/examples/README.md
@@ -0,0 +1,16 @@
+# Examples
+In this folder we showcase some examples of using code models for downstream tasks.
+
+## Complexity prediction
+In this task we want to predict the complexity of Java programs in the [CodeComplex](https://huggingface.co/datasets/codeparrot/codecomplex) dataset. Using the Hugging Face `Trainer`, we finetuned [multilingual CodeParrot](https://huggingface.co/codeparrot/codeparrot-small-multi) and [UniXcoder](https://huggingface.co/microsoft/unixcoder-base-nine) on it, and we used the latter to build this Java complexity prediction [space](https://huggingface.co/spaces/codeparrot/code-complexity-predictor) on the Hugging Face Hub.
+
+To fine-tune a model on this dataset you can use the following command:
+
+```bash
+python train_complexity_predictor.py \
+    --model_ckpt microsoft/unixcoder-base-nine \
+    --num_epochs 60 \
+    --num_warmup_steps 10 \
+    --batch_size 8 \
+    --learning_rate 5e-4
+```
\ No newline at end of file
diff --git a/examples/research_projects/codeparrot/examples/requirements.txt b/examples/research_projects/codeparrot/examples/requirements.txt
new file mode 100644
index 000000000000..997334e27e18
--- /dev/null
+++ b/examples/research_projects/codeparrot/examples/requirements.txt
@@ -0,0 +1,5 @@
+datasets==2.3.2
+transformers==4.21.1
+wandb==0.13.1
+evaluate==0.2.2
+scikit-learn==1.1.2
\ No newline at end of file
diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
new file mode 100644
index 000000000000..fa3e09fd602a
--- /dev/null
+++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
@@ -0,0 +1,154 @@
+import numpy as np
+import argparse
+from copy import deepcopy
+
+from torch.optim import AdamW
+from datasets import load_dataset, DatasetDict, ClassLabel
+from evaluate import load
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    TrainingArguments,
+    Trainer,
+    TrainerCallback,
+    DataCollatorWithPadding,
+    get_scheduler,
+    set_seed,
+)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_ckpt", type=str, default="microsoft/unixcoder-base-nine")
+    parser.add_argument("--num_epochs", type=int, default=5)
+    parser.add_argument("--batch_size", type=int, default=6)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
+    parser.add_argument("--freeze", type=bool, default=True)
+    parser.add_argument("--learning_rate", type=float, default=5e-4)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--lr_scheduler_type", type=str,
default="cosine") + parser.add_argument("--num_warmup_steps", type=int, default=10) + parser.add_argument("--weight_decay", type=float, default=0.01) + return parser.parse_args() + + +metric = load("accuracy") + + +def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + return metric.compute(predictions=predictions, references=labels) + + +def get_grouped_params(model, args, no_decay=["bias", "ln_1.weight", "ln_2.weight", "ln_f.weight"]): + params_with_wd, params_without_wd = [], [] + for n, p in model.named_parameters(): + if any(nd in n for nd in no_decay): + params_without_wd.append(p) + else: + params_with_wd.append(p) + return [ + {"params": params_with_wd, "weight_decay": args.weight_decay}, + {"params": params_without_wd, "weight_decay": 0.0}, + ] + + +class CustomCallback(TrainerCallback): + def __init__(self, trainer) -> None: + super().__init__() + self._trainer = trainer + + def on_epoch_end(self, args, state, control, **kwargs): + if control.should_evaluate: + control_copy = deepcopy(control) + self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train") + return control_copy + + +def main(): + args = get_args() + set_seed(args.seed) + + dataset = load_dataset("codeparrot/codecomplex", split="train") + train_test = dataset.train_test_split(test_size=0.2) + test_validation = train_test["test"].train_test_split(test_size=0.5) + train_test_validation = DatasetDict( + { + "train": train_test["train"], + "test": test_validation["train"], + "valid": test_validation["test"], + } + ) + + print("Loading tokenizer and model") + tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) + tokenizer.pad_token = tokenizer.eos_token + model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=7) + model.config.pad_token_id = model.config.eos_token_id + + if args.freeze: + for param in model.roberta.parameters(): + param.requires_grad = False + + labels = ClassLabel(num_classes=7, names=list(set(train_test_validation["train"]["complexity"]))) + + def tokenize(example): + inputs = tokenizer(example["src"], truncation=True, max_length=1024) + label = labels.str2int(example["complexity"]) + return { + "input_ids": inputs["input_ids"], + "attention_mask": inputs["attention_mask"], + "label": label, + } + + tokenized_datasets = train_test_validation.map( + tokenize, + batched=True, + remove_columns=train_test_validation["train"].column_names, + ) + data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + # Prepare the optimizer and learning rate scheduler + optimizer = AdamW(get_grouped_params(model, args), lr=args.learning_rate) + lr_scheduler = get_scheduler( + name=args.lr_scheduler_type, + optimizer=optimizer, + num_training_steps=args.num_epochs, + num_warmup_steps=args.num_warmup_steps, + ) + + training_args = TrainingArguments( + output_dir="./results_java", + learning_rate=args.learning_rate, + evaluation_strategy="epoch", + save_strategy="epoch", + logging_strategy="epoch", + per_device_train_batch_size=args.batch_size, + per_device_eval_batch_size=args.batch_size, + num_train_epochs=args.num_epochs, + gradient_accumulation_steps=args.gradient_accumulation_steps, + weight_decay=0.01, + metric_for_best_model="accuracy", + run_name="complexity-java", + report_to="wandb", + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_datasets["train"], + eval_dataset=tokenized_datasets["valid"], + tokenizer=tokenizer, + 
data_collator=data_collator, + compute_metrics=compute_metrics, + optimizers=(optimizer, lr_scheduler), + ) + + print("Training...") + trainer.add_callback(CustomCallback(trainer)) + trainer.train() + + +if __name__ == "__main__": + main() From f8263d18df97e0710f04cdb87a350854e145b8ba Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 18 Aug 2022 09:08:19 +0000 Subject: [PATCH 02/10] reformat file --- .../codeparrot/scripts/minhash_deduplication.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/research_projects/codeparrot/scripts/minhash_deduplication.py b/examples/research_projects/codeparrot/scripts/minhash_deduplication.py index cd72dcb70c9e..9e1ef11ff07d 100644 --- a/examples/research_projects/codeparrot/scripts/minhash_deduplication.py +++ b/examples/research_projects/codeparrot/scripts/minhash_deduplication.py @@ -63,7 +63,6 @@ def add(self, code_key: Tuple, min_hash: MinHash) -> None: self._index.insert(code_key, min_hash) if len(close_duplicates) > 0: - for base_duplicate in close_duplicates: if base_duplicate in self._duplicate_clusters: self._duplicate_clusters[base_duplicate].add(code_key) From 3794de6e54302162cad71170af03e89adbd0ffc4 Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Thu, 18 Aug 2022 11:17:06 +0200 Subject: [PATCH 03/10] mention examples in codeparrot readme --- examples/research_projects/codeparrot/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md index ef92606c545a..076ac245d1dd 100644 --- a/examples/research_projects/codeparrot/README.md +++ b/examples/research_projects/codeparrot/README.md @@ -12,7 +12,7 @@ This is an open-source effort to train and evaluate code generation models. 
Code - continuously push checkpoints to the hub with `huggingface_hub` - stream the dataset with `datasets` during training to avoid disk bottlenecks - apply the `code_eval` metric in `datasets` to evaluate on [OpenAI's _HumanEval_ benchmark](https://huggingface.co/datasets/openai_humaneval) - +- showcase examples for downstream tasks with code models in [examples](https://github.com/huggingface/transformers/tree/main/examples/research_projects/codeparrot/examples) folder ## Installation To install the dependencies simply run the following command: ```bash From c9612538a454f59fa0b5ee682e2c171855ee559f Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 18 Aug 2022 09:33:33 +0000 Subject: [PATCH 04/10] reformat imports --- .../codeparrot/examples/train_complexity_predictor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py index fa3e09fd602a..b2ee7e5eb9f8 100644 --- a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py +++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py @@ -1,17 +1,18 @@ -import numpy as np import argparse from copy import deepcopy +import numpy as np +from datasets import ClassLabel, DatasetDict, load_dataset from torch.optim import AdamW -from datasets import load_dataset, DatasetDict, ClassLabel + from evaluate import load from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, - TrainingArguments, + DataCollatorWithPadding, Trainer, TrainerCallback, - DataCollatorWithPadding, + TrainingArguments, get_scheduler, set_seed, ) From 7ab673e8a791dc56325c3b7afdb222c914cf9e39 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 18 Aug 2022 12:36:46 +0000 Subject: [PATCH 05/10] use Trainer optimizer and scheduler type and add output_dir as argument --- .../examples/train_complexity_predictor.py | 59 ++++++------------- 1 file changed, 19 insertions(+), 40 deletions(-) diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py index b2ee7e5eb9f8..b7be3131c5e6 100644 --- a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py +++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py @@ -3,24 +3,18 @@ import numpy as np from datasets import ClassLabel, DatasetDict, load_dataset -from torch.optim import AdamW - from evaluate import load -from transformers import ( - AutoModelForSequenceClassification, - AutoTokenizer, - DataCollatorWithPadding, - Trainer, - TrainerCallback, - TrainingArguments, - get_scheduler, - set_seed, -) + +from transformers import (AutoModelForSequenceClassification, AutoTokenizer, + DataCollatorWithPadding, Trainer, TrainerCallback, + TrainingArguments, set_seed) def get_args(): parser = argparse.ArgumentParser() - parser.add_argument("--model_ckpt", type=str, default="microsoft/unixcoder-base-nine") + parser.add_argument( + "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" + ) parser.add_argument("--num_epochs", type=int, default=5) parser.add_argument("--batch_size", type=int, default=6) parser.add_argument("--gradient_accumulation_steps", type=int, default=1) @@ -30,6 +24,7 @@ def get_args(): parser.add_argument("--lr_scheduler_type", type=str, default="cosine") parser.add_argument("--num_warmup_steps", type=int, default=10) 
parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--output_dir", type=str, default="./results") return parser.parse_args() @@ -42,19 +37,6 @@ def compute_metrics(eval_pred): return metric.compute(predictions=predictions, references=labels) -def get_grouped_params(model, args, no_decay=["bias", "ln_1.weight", "ln_2.weight", "ln_f.weight"]): - params_with_wd, params_without_wd = [], [] - for n, p in model.named_parameters(): - if any(nd in n for nd in no_decay): - params_without_wd.append(p) - else: - params_with_wd.append(p) - return [ - {"params": params_with_wd, "weight_decay": args.weight_decay}, - {"params": params_without_wd, "weight_decay": 0.0}, - ] - - class CustomCallback(TrainerCallback): def __init__(self, trainer) -> None: super().__init__() @@ -63,7 +45,9 @@ def __init__(self, trainer) -> None: def on_epoch_end(self, args, state, control, **kwargs): if control.should_evaluate: control_copy = deepcopy(control) - self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train") + self._trainer.evaluate( + eval_dataset=self._trainer.train_dataset, metric_key_prefix="train" + ) return control_copy @@ -85,14 +69,18 @@ def main(): print("Loading tokenizer and model") tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=7) + model = AutoModelForSequenceClassification.from_pretrained( + args.model_ckpt, num_labels=7 + ) model.config.pad_token_id = model.config.eos_token_id if args.freeze: for param in model.roberta.parameters(): param.requires_grad = False - labels = ClassLabel(num_classes=7, names=list(set(train_test_validation["train"]["complexity"]))) + labels = ClassLabel( + num_classes=7, names=list(set(train_test_validation["train"]["complexity"])) + ) def tokenize(example): inputs = tokenizer(example["src"], truncation=True, max_length=1024) @@ -110,18 +98,10 @@ def tokenize(example): ) data_collator = DataCollatorWithPadding(tokenizer=tokenizer) - # Prepare the optimizer and learning rate scheduler - optimizer = AdamW(get_grouped_params(model, args), lr=args.learning_rate) - lr_scheduler = get_scheduler( - name=args.lr_scheduler_type, - optimizer=optimizer, - num_training_steps=args.num_epochs, - num_warmup_steps=args.num_warmup_steps, - ) - training_args = TrainingArguments( - output_dir="./results_java", + output_dir=args.output_dir, learning_rate=args.learning_rate, + lr_scheduler_type=args.lr_scheduler_type, evaluation_strategy="epoch", save_strategy="epoch", logging_strategy="epoch", @@ -143,7 +123,6 @@ def tokenize(example): tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics, - optimizers=(optimizer, lr_scheduler), ) print("Training...") From 1015a06e7663c1171d2e4fbe3da4d4c8758cfb28 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 18 Aug 2022 13:02:45 +0000 Subject: [PATCH 06/10] add example of text-to-python and python-to-text models --- .../codeparrot/examples/README.md | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/examples/research_projects/codeparrot/examples/README.md b/examples/research_projects/codeparrot/examples/README.md index 7c8e59688618..c1980262d827 100644 --- a/examples/research_projects/codeparrot/examples/README.md +++ b/examples/research_projects/codeparrot/examples/README.md @@ -13,4 +13,46 @@ python train_complexity_predictor.py \ --num_warmup_steps 10 \ --batch_size 8 \ --learning_rate 5e-4 +``` + 
+## Code generation: text to Python
+In this task we want to train a model to generate code from English text. We finetuned Codeparrot-small on [github-jupyter-text-to-code](https://huggingface.co/datasets/codeparrot/github-jupyter-text-to-code), a dataset where the samples are a succession of docstrings and their Python code, originally extracted from Jupyter notebooks parsed in this [dataset](https://huggingface.co/datasets/codeparrot/github-jupyter-parsed).
+
+To fine-tune a model on this dataset we use the same [script](https://github.com/huggingface/transformers/blob/main/examples/research_projects/codeparrot/scripts/codeparrot_training.py) as for the pretraining of CodeParrot:
+
+```bash
+accelerate launch scripts/codeparrot_training.py \
+    --model_ckpt codeparrot/codeparrot-small \
+    --dataset_name_train codeparrot/github-jupyter-text-to-code \
+    --dataset_name_valid codeparrot/github-jupyter-text-to-code \
+    --train_batch_size 12 \
+    --valid_batch_size 12 \
+    --learning_rate 5e-4 \
+    --num_warmup_steps 100 \
+    --gradient_accumulation 1 \
+    --gradient_checkpointing False \
+    --max_train_steps 3000 \
+    --save_checkpoint_steps 200 \
+    --save_dir jupyter-text-to-python
+```
+
+## Code explanation: Python to text
+In this task we want to train a model to explain Python code. We finetuned Codeparrot-small on [github-jupyter-code-to-text](https://huggingface.co/datasets/codeparrot/github-jupyter-code-to-text), a dataset where the samples are a succession of Python code and its explanation as a docstring. It was built by inverting the order of the text and code pairs of the text-to-code dataset above and adding the delimiters "Explanation:" and "End of explanation" inside the docstrings.
+
+To fine-tune a model on this dataset we use the same [script](https://github.com/huggingface/transformers/blob/main/examples/research_projects/codeparrot/scripts/codeparrot_training.py) as for the pretraining of CodeParrot:
+
+```bash
+accelerate launch scripts/codeparrot_training.py \
+    --model_ckpt codeparrot/codeparrot-small \
+    --dataset_name_train codeparrot/github-jupyter-code-to-text \
+    --dataset_name_valid codeparrot/github-jupyter-code-to-text \
+    --train_batch_size 12 \
+    --valid_batch_size 12 \
+    --learning_rate 5e-4 \
+    --num_warmup_steps 100 \
+    --gradient_accumulation 1 \
+    --gradient_checkpointing False \
+    --max_train_steps 3000 \
+    --save_checkpoint_steps 200 \
+    --save_dir jupyter-python-to-text
+```
\ No newline at end of file
From fa8a0df4c10a98557c7026d2cd96b311e8c2a8b2 Mon Sep 17 00:00:00 2001
From: loubnabnl
Date: Thu, 18 Aug 2022 13:05:18 +0000
Subject: [PATCH 07/10] reformat imports

---
 .../examples/train_complexity_predictor.py   | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
index b7be3131c5e6..9fbde6416571 100644
--- a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
+++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
@@ -5,9 +5,15 @@
 import numpy as np
 from datasets import ClassLabel, DatasetDict, load_dataset
 from evaluate import load
 
-from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
-                          DataCollatorWithPadding, Trainer, TrainerCallback,
-                          TrainingArguments, set_seed)
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    Trainer,
+    TrainerCallback,
+    TrainingArguments,
+    set_seed,
+)
 
def get_args(): From da21da8183a98dad02a7f60d2b1abcf322a1e5ff Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 18 Aug 2022 13:08:32 +0000 Subject: [PATCH 08/10] mention the downstream examples in the readme --- examples/research_projects/codeparrot/README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md index 076ac245d1dd..a227b29a1691 100644 --- a/examples/research_projects/codeparrot/README.md +++ b/examples/research_projects/codeparrot/README.md @@ -12,7 +12,11 @@ This is an open-source effort to train and evaluate code generation models. Code - continuously push checkpoints to the hub with `huggingface_hub` - stream the dataset with `datasets` during training to avoid disk bottlenecks - apply the `code_eval` metric in `datasets` to evaluate on [OpenAI's _HumanEval_ benchmark](https://huggingface.co/datasets/openai_humaneval) -- showcase examples for downstream tasks with code models in [examples](https://github.com/huggingface/transformers/tree/main/examples/research_projects/codeparrot/examples) folder +- showcase examples for downstream tasks with code models in [examples](https://github.com/huggingface/transformers/tree/main/examples/research_projects/codeparrot/examples) folder: + - Algorithmic complexity prediction + - Code eneration from english text + - Code explanation + ## Installation To install the dependencies simply run the following command: ```bash From 17e508d914bd8b7c7ead504f5fb995d55d225e4f Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 18 Aug 2022 13:09:32 +0000 Subject: [PATCH 09/10] reformat code --- .../examples/train_complexity_predictor.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py index 9fbde6416571..8fc30b912468 100644 --- a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py +++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py @@ -3,8 +3,8 @@ import numpy as np from datasets import ClassLabel, DatasetDict, load_dataset -from evaluate import load +from evaluate import load from transformers import ( AutoModelForSequenceClassification, AutoTokenizer, @@ -18,9 +18,7 @@ def get_args(): parser = argparse.ArgumentParser() - parser.add_argument( - "--model_ckpt", type=str, default="microsoft/unixcoder-base-nine" - ) + parser.add_argument("--model_ckpt", type=str, default="microsoft/unixcoder-base-nine") parser.add_argument("--num_epochs", type=int, default=5) parser.add_argument("--batch_size", type=int, default=6) parser.add_argument("--gradient_accumulation_steps", type=int, default=1) @@ -51,9 +49,7 @@ def __init__(self, trainer) -> None: def on_epoch_end(self, args, state, control, **kwargs): if control.should_evaluate: control_copy = deepcopy(control) - self._trainer.evaluate( - eval_dataset=self._trainer.train_dataset, metric_key_prefix="train" - ) + self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train") return control_copy @@ -75,18 +71,14 @@ def main(): print("Loading tokenizer and model") tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt) tokenizer.pad_token = tokenizer.eos_token - model = AutoModelForSequenceClassification.from_pretrained( - args.model_ckpt, num_labels=7 - ) + model = 
AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=7) model.config.pad_token_id = model.config.eos_token_id if args.freeze: for param in model.roberta.parameters(): param.requires_grad = False - labels = ClassLabel( - num_classes=7, names=list(set(train_test_validation["train"]["complexity"])) - ) + labels = ClassLabel(num_classes=7, names=list(set(train_test_validation["train"]["complexity"]))) def tokenize(example): inputs = tokenizer(example["src"], truncation=True, max_length=1024) From 6f49ca4f5b0837fc4c1053f8cea1f89215d152ad Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Thu, 18 Aug 2022 13:18:22 +0000 Subject: [PATCH 10/10] fix typo --- examples/research_projects/codeparrot/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md index a227b29a1691..6c57c4350fbc 100644 --- a/examples/research_projects/codeparrot/README.md +++ b/examples/research_projects/codeparrot/README.md @@ -14,7 +14,7 @@ This is an open-source effort to train and evaluate code generation models. Code - apply the `code_eval` metric in `datasets` to evaluate on [OpenAI's _HumanEval_ benchmark](https://huggingface.co/datasets/openai_humaneval) - showcase examples for downstream tasks with code models in [examples](https://github.com/huggingface/transformers/tree/main/examples/research_projects/codeparrot/examples) folder: - Algorithmic complexity prediction - - Code eneration from english text + - Code generation from english text - Code explanation ## Installation
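A minimal inference sketch for the complexity predictor trained by `train_complexity_predictor.py` above. The checkpoint path is an assumption: the script's `--output_dir` defaults to `./results` and the `Trainer` writes `checkpoint-*` folders there, so point `checkpoint_dir` at one of those (or at a directory produced with `trainer.save_model`). The label mapping is rebuilt the same way the script builds its `ClassLabel`:

```python
import numpy as np
from datasets import ClassLabel, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Assumption: a checkpoint written by the Trainer above, e.g. "./results/checkpoint-500"
# or a directory created with trainer.save_model().
checkpoint_dir = "./results/checkpoint-500"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)

# Rebuild the label mapping the same way the training script does
# (a ClassLabel over the dataset's `complexity` column).
dataset = load_dataset("codeparrot/codecomplex", split="train")
labels = ClassLabel(num_classes=7, names=list(set(dataset["complexity"])))

java_code = "int sum = 0; for (int i = 0; i < n; i++) { sum += i; }"
inputs = tokenizer(java_code, truncation=True, max_length=1024, return_tensors="pt")
logits = model(**inputs).logits.detach().numpy()
print(labels.int2str(int(np.argmax(logits, axis=1)[0])))
```

Since `set` ordering is not guaranteed to be identical across Python processes, a more robust design is to persist the label names at training time (for example in the model's `config.id2label`) rather than rebuilding them as above.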