diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md
index ef92606c545a78..6c57c4350fbc02 100644
--- a/examples/research_projects/codeparrot/README.md
+++ b/examples/research_projects/codeparrot/README.md
@@ -12,7 +12,11 @@ This is an open-source effort to train and evaluate code generation models. Code
 - continuously push checkpoints to the hub with `huggingface_hub`
 - stream the dataset with `datasets` during training to avoid disk bottlenecks
 - apply the `code_eval` metric in `datasets` to evaluate on [OpenAI's _HumanEval_ benchmark](https://huggingface.co/datasets/openai_humaneval)
-
+- showcase examples for downstream tasks with code models in the [examples](https://github.com/huggingface/transformers/tree/main/examples/research_projects/codeparrot/examples) folder:
+    - Algorithmic complexity prediction
+    - Code generation from English text
+    - Code explanation
+
 ## Installation
 To install the dependencies simply run the following command:
 ```bash
diff --git a/examples/research_projects/codeparrot/examples/README.md b/examples/research_projects/codeparrot/examples/README.md
new file mode 100644
index 00000000000000..c1980262d8275b
--- /dev/null
+++ b/examples/research_projects/codeparrot/examples/README.md
@@ -0,0 +1,58 @@
+# Examples
+In this folder we showcase some examples of using code models for downstream tasks.
+
+## Complexity prediction
+In this task we want to predict the complexity of Java programs in the [CodeComplex](https://huggingface.co/datasets/codeparrot/codecomplex) dataset. Using the Hugging Face `Trainer`, we fine-tuned [multilingual CodeParrot](https://huggingface.co/codeparrot/codeparrot-small-multi) and [UniXcoder](https://huggingface.co/microsoft/unixcoder-base-nine) on it, and we used the latter to build this Java complexity prediction [space](https://huggingface.co/spaces/codeparrot/code-complexity-predictor) on the Hugging Face Hub.
+
+To fine-tune a model on this dataset you can use the following command:
+
+```bash
+python train_complexity_predictor.py \
+    --model_ckpt microsoft/unixcoder-base-nine \
+    --num_epochs 60 \
+    --num_warmup_steps 10 \
+    --batch_size 8 \
+    --learning_rate 5e-4
+```
+
+## Code generation: text to Python
+In this task we want to train a model to generate code from English text. We fine-tuned CodeParrot-small on [github-jupyter-text-to-code](https://huggingface.co/datasets/codeparrot/github-jupyter-text-to-code), a dataset where the samples are a succession of docstrings and their Python code, originally extracted from Jupyter notebooks parsed in this [dataset](https://huggingface.co/datasets/codeparrot/github-jupyter-parsed).
+
+To fine-tune a model on this dataset we use the same [script](https://github.com/huggingface/transformers/blob/main/examples/research_projects/codeparrot/scripts/codeparrot_training.py) as for the pretraining of CodeParrot:
+
+```bash
+accelerate launch scripts/codeparrot_training.py \
+    --model_ckpt codeparrot/codeparrot-small \
+    --dataset_name_train codeparrot/github-jupyter-text-to-code \
+    --dataset_name_valid codeparrot/github-jupyter-text-to-code \
+    --train_batch_size 12 \
+    --valid_batch_size 12 \
+    --learning_rate 5e-4 \
+    --num_warmup_steps 100 \
+    --gradient_accumulation 1 \
+    --gradient_checkpointing False \
+    --max_train_steps 3000 \
+    --save_checkpoint_steps 200 \
+    --save_dir jupyter-text-to-python
+```
+
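+Once the training above has finished, one quick way to try the model is the `text-generation` pipeline. This is a minimal sketch: it assumes the fine-tuned checkpoint is available in standard `transformers` format under the `--save_dir` used above (a Hub model id works the same way), and the prompt and generation settings are only illustrative:
+
+```python
+from transformers import pipeline
+
+# "jupyter-text-to-python" is the --save_dir passed to the training command above;
+# the tokenizer is unchanged by fine-tuning, so the base one can be reused.
+pipe = pipeline("text-generation", model="jupyter-text-to-python", tokenizer="codeparrot/codeparrot-small")
+
+# The training samples are docstrings followed by their Python code,
+# so we prompt with a docstring and let the model complete the code.
+prompt = '"""Plot a histogram of the "age" column of the dataframe df."""\n'
+print(pipe(prompt, max_new_tokens=64, do_sample=True, temperature=0.2)[0]["generated_text"])
+```
+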
+## Code explanation: Python to text
+In this task we want to train a model to explain Python code. We fine-tuned CodeParrot-small on [github-jupyter-code-to-text](https://huggingface.co/datasets/codeparrot/github-jupyter-code-to-text), a dataset where the samples are a succession of Python code and its explanation as a docstring. We simply inverted the order of the text and code pairs in the [github-jupyter-text-to-code](https://huggingface.co/datasets/codeparrot/github-jupyter-text-to-code) dataset and added the delimiters "Explanation:" and "End of explanation" inside the docstrings.
+
+To fine-tune a model on this dataset we use the same [script](https://github.com/huggingface/transformers/blob/main/examples/research_projects/codeparrot/scripts/codeparrot_training.py) as for the pretraining of CodeParrot:
+
+```bash
+accelerate launch scripts/codeparrot_training.py \
+    --model_ckpt codeparrot/codeparrot-small \
+    --dataset_name_train codeparrot/github-jupyter-code-to-text \
+    --dataset_name_valid codeparrot/github-jupyter-code-to-text \
+    --train_batch_size 12 \
+    --valid_batch_size 12 \
+    --learning_rate 5e-4 \
+    --num_warmup_steps 100 \
+    --gradient_accumulation 1 \
+    --gradient_checkpointing False \
+    --max_train_steps 3000 \
+    --save_checkpoint_steps 200 \
+    --save_dir jupyter-python-to-text
+```
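+
+As above, the resulting model can be tried with the `text-generation` pipeline. This is a minimal sketch: it assumes a checkpoint in standard `transformers` format under the `--save_dir` used above, and the exact prompt format may need adjusting to match the dataset:
+
+```python
+from transformers import pipeline
+
+# "jupyter-python-to-text" is the --save_dir passed to the training command above.
+pipe = pipeline("text-generation", model="jupyter-python-to-text", tokenizer="codeparrot/codeparrot-small")
+
+# Explanations in the training data are wrapped between the delimiters
+# "Explanation:" and "End of explanation" inside docstrings, so we prompt with
+# the code followed by the opening delimiter and cut at the closing one.
+code = "df.groupby('country')['sales'].sum().plot(kind='bar')\n"
+generated = pipe(code + '"""Explanation:', max_new_tokens=64)[0]["generated_text"]
+print(generated.split("End of explanation")[0])
+```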
\ No newline at end of file
diff --git a/examples/research_projects/codeparrot/examples/requirements.txt b/examples/research_projects/codeparrot/examples/requirements.txt
new file mode 100644
index 00000000000000..997334e27e18fc
--- /dev/null
+++ b/examples/research_projects/codeparrot/examples/requirements.txt
@@ -0,0 +1,5 @@
+datasets==2.3.2
+transformers==4.21.1
+wandb==0.13.1
+evaluate==0.2.2
+scikit-learn==1.1.2
\ No newline at end of file
diff --git a/examples/research_projects/codeparrot/examples/train_complexity_predictor.py b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
new file mode 100644
index 00000000000000..8fc30b912468ba
--- /dev/null
+++ b/examples/research_projects/codeparrot/examples/train_complexity_predictor.py
@@ -0,0 +1,132 @@
+import argparse
+from copy import deepcopy
+
+import numpy as np
+from datasets import ClassLabel, DatasetDict, load_dataset
+
+from evaluate import load
+from transformers import (
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    Trainer,
+    TrainerCallback,
+    TrainingArguments,
+    set_seed,
+)
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_ckpt", type=str, default="microsoft/unixcoder-base-nine")
+    parser.add_argument("--num_epochs", type=int, default=5)
+    parser.add_argument("--batch_size", type=int, default=6)
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
+    parser.add_argument("--freeze", type=bool, default=True)
+    parser.add_argument("--learning_rate", type=float, default=5e-4)
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
+    parser.add_argument("--num_warmup_steps", type=int, default=10)
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--output_dir", type=str, default="./results")
+    return parser.parse_args()
+
+
+metric = load("accuracy")
+
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    predictions = np.argmax(predictions, axis=1)
+    return metric.compute(predictions=predictions, references=labels)
+
+
+class CustomCallback(TrainerCallback):
+    def __init__(self, trainer) -> None:
+        super().__init__()
+        self._trainer = trainer
+
+    def on_epoch_end(self, args, state, control, **kwargs):
+        if control.should_evaluate:
+            control_copy = deepcopy(control)
+            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
+            return control_copy
+
+
+def main():
+    args = get_args()
+    set_seed(args.seed)
+
+    dataset = load_dataset("codeparrot/codecomplex", split="train")
+    train_test = dataset.train_test_split(test_size=0.2)
+    test_validation = train_test["test"].train_test_split(test_size=0.5)
+    train_test_validation = DatasetDict(
+        {
+            "train": train_test["train"],
+            "test": test_validation["train"],
+            "valid": test_validation["test"],
+        }
+    )
+
+    print("Loading tokenizer and model")
+    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
+    tokenizer.pad_token = tokenizer.eos_token
+    model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, num_labels=7)
+    model.config.pad_token_id = model.config.eos_token_id
+
+    if args.freeze:
+        for param in model.roberta.parameters():
+            param.requires_grad = False
+
+    labels = ClassLabel(num_classes=7, names=list(set(train_test_validation["train"]["complexity"])))
+
+    def tokenize(example):
+        inputs = tokenizer(example["src"], truncation=True, max_length=1024)
+        label = labels.str2int(example["complexity"])
+        return {
+            "input_ids": inputs["input_ids"],
+            "attention_mask": inputs["attention_mask"],
+            "label": label,
+        }
+
+    tokenized_datasets = train_test_validation.map(
+        tokenize,
+        batched=True,
+        remove_columns=train_test_validation["train"].column_names,
+    )
+    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+    training_args = TrainingArguments(
+        output_dir=args.output_dir,
+        learning_rate=args.learning_rate,
+        lr_scheduler_type=args.lr_scheduler_type,
+        evaluation_strategy="epoch",
+        save_strategy="epoch",
+        logging_strategy="epoch",
+        per_device_train_batch_size=args.batch_size,
+        per_device_eval_batch_size=args.batch_size,
+        num_train_epochs=args.num_epochs,
+        gradient_accumulation_steps=args.gradient_accumulation_steps,
+        weight_decay=args.weight_decay,
+        metric_for_best_model="accuracy",
+        run_name="complexity-java",
+        report_to="wandb",
+    )
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_datasets["train"],
+        eval_dataset=tokenized_datasets["valid"],
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+        compute_metrics=compute_metrics,
+    )
+
+    print("Training...")
+    trainer.add_callback(CustomCallback(trainer))
+    trainer.train()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/research_projects/codeparrot/scripts/minhash_deduplication.py b/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
index cd72dcb70c9ed2..9e1ef11ff07d15 100644
--- a/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
+++ b/examples/research_projects/codeparrot/scripts/minhash_deduplication.py
@@ -63,7 +63,6 @@ def add(self, code_key: Tuple, min_hash: MinHash) -> None:
         self._index.insert(code_key, min_hash)
         if len(close_duplicates) > 0:
-
             for base_duplicate in close_duplicates:
                 if base_duplicate in self._duplicate_clusters:
                     self._duplicate_clusters[base_duplicate].add(code_key)
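
For reference, the classifier trained by `train_complexity_predictor.py` is saved by the `Trainer` under `--output_dir` (`./results` by default). Below is a minimal sketch for trying a saved checkpoint with the `text-classification` pipeline; the `checkpoint-500` folder name is hypothetical (use whichever `checkpoint-*` directory the run produced), and since the script does not store the `ClassLabel` names in the model config, predictions come back as generic `LABEL_0` to `LABEL_6` ids:

```python
from transformers import pipeline

# Hypothetical checkpoint folder written by the Trainer under --output_dir.
classifier = pipeline("text-classification", model="./results/checkpoint-500")

java_snippet = """
for (int i = 0; i < n; i++) {
    for (int j = 0; j < n; j++) {
        sum += a[i][j];
    }
}
"""
print(classifier(java_snippet))
```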