Skip to content

Commit

Permalink
updates to Example_HuggingFace_Mistral_Transformer.ipynb (#164)
Browse files Browse the repository at this point in the history
  • Loading branch information
briane412 authored Apr 30, 2024
1 parent 6ba1673 commit 04bd93c
Showing 1 changed file with 34 additions and 35 deletions.
69 changes: 34 additions & 35 deletions colab-notebooks/Example_HuggingFace_Mistral_Transformer.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -33,32 +33,31 @@
"source": [
"#@title Install all dependencies (run only once per session)\n",
"\n",
"!nvidia-smi\n",
"# Run the following command only if a nvidia driver is installed\n",
"!nvidia-smi\n",
"\n",
"!pip install miditok\n",
"!pip install symusic\n",
"!pip install torch\n",
"!pip install torchtoolkit\n",
"!pip install transformers\n",
"!pip install accelerate\n",
"!pip install evaluate\n",
"!pip install tqdm\n",
"!pip install scikit-learn\n",
"!pip install tensorboard\n",
"%pip install miditok\n",
"%pip install symusic\n",
"%pip install torch\n",
"%pip install transformers\n",
"%pip install accelerate\n",
"%pip install evaluate\n",
"%pip install tensorboard\n",
"%pip install scikit-learn\n",
"\n",
"!wget https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip\n",
"!unzip 'maestro-v3.0.0-midi.zip'\n",
"!rm 'maestro-v3.0.0-midi.zip'\n",
"!mv 'maestro-v3.0.0' 'Maestro'\n",
"!wget https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip\n",
"!unzip 'maestro-v3.0.0-midi.zip'\n",
"!rm 'maestro-v3.0.0-midi.zip'\n",
"!mv 'maestro-v3.0.0' 'Maestro'\n",
"\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"from random import shuffle\n",
"\n",
"from evaluate import load as load_metric\n",
"from miditok import REMI, TokenizerConfig\n",
"from miditok.pytorch_data import DatasetMIDI, DataCollator, split_midis_for_training\n",
"from miditok.data_augmentation import augment_midi_dataset\n",
"from miditok.pytorch_data import DatasetMIDI, DataCollator, split_files_for_training\n",
"from miditok.data_augmentation import augment_dataset\n",
"from torch import Tensor, argmax\n",
"from torch.utils.data import DataLoader\n",
"from torch.cuda import is_available as cuda_available, is_bf16_supported\n",
Expand Down Expand Up @@ -105,8 +104,8 @@
"# Creates the tokenizer\n",
"tokenizer = REMI(config)\n",
"\n",
"# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 10k tokens\n",
"midi_paths = list(Path(\"Maestro\").glob(\"**/*.mid\")) + list(Path(\"Maestro\").glob(\"**/*.midi\"))\n",
"# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 30k tokens\n",
"midi_paths = list(Path(\"Maestro\").resolve().glob(\"**/*.mid\")) + list(Path(\"Maestro\").resolve().glob(\"**/*.midi\"))\n",
"tokenizer.train(\n",
" vocab_size=30000,\n",
" files_paths=midi_paths,\n",
Expand Down Expand Up @@ -140,13 +139,13 @@
"midi_paths_train = midi_paths[num_files_valid + num_files_test:]\n",
"\n",
"# Chunk MIDIs and perform data augmentation on each subset independently\n",
"for files_paths, subset_name in {\n",
"for files_paths, subset_name in (\n",
" (midi_paths_train, \"train\"), (midi_paths_valid, \"valid\"), (midi_paths_test, \"test\")\n",
"}:\n",
"):\n",
"\n",
" # Split the MIDIs into chunks of sizes approximatly about 1024 tokens\n",
" # Split the MIDIs into chunks of approximately 1024 tokens\n",
" subset_chunks_dir = Path(f\"Maestro_{subset_name}\")\n",
" split_midis_for_training(\n",
" split_files_for_training(\n",
" files_paths=files_paths,\n",
" tokenizer=tokenizer,\n",
" save_dir=subset_chunks_dir,\n",
Expand All @@ -155,7 +154,7 @@
" )\n",
"\n",
" # Perform data augmentation\n",
" augment_midi_dataset(\n",
" augment_dataset(\n",
" subset_chunks_dir,\n",
" pitch_offsets=[-12, 12],\n",
" velocity_offsets=[-4, 4],\n",
Expand Down Expand Up @@ -187,7 +186,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -268,7 +267,7 @@
" learning_rate=1e-4,\n",
" weight_decay=0.01,\n",
" max_grad_norm=3.0,\n",
" max_steps=100000,\n",
" max_steps=20000,\n",
" lr_scheduler_type=\"cosine_with_restarts\",\n",
" warmup_ratio=0.3,\n",
" log_level=\"debug\",\n",
Expand All @@ -290,7 +289,7 @@
" gradient_checkpointing=True,\n",
")\n",
"\n",
"collator = DataCollator(tokenizer[\"PAD_None\"])\n",
"collator = DataCollator(tokenizer[\"PAD_None\"], copy_inputs_as_labels=True)\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_config,\n",
Expand Down Expand Up @@ -329,15 +328,15 @@
"source": [
"(gen_results_path := Path('gen_res')).mkdir(parents=True, exist_ok=True)\n",
"generation_config = GenerationConfig(\n",
" max_new_tokens=512, # extends samples by 512 tokens\n",
" num_beams=1, # no beam search\n",
" do_sample=True, # but sample instead\n",
" max_new_tokens=200, # extends samples by 200 tokens\n",
" num_beams=1, # no beam search\n",
" do_sample=True, # but sample instead\n",
" temperature=0.9,\n",
" top_k=15,\n",
" top_p=0.95,\n",
" epsilon_cutoff=3e-4,\n",
" eta_cutoff=1e-3,\n",
" pad_token_id=config.padding_token_id,\n",
" pad_token_id=tokenizer.pad_token_id,\n",
")\n",
"\n",
"# Here the sequences are padded to the left, so that the last token along the time dimension\n",
Expand All @@ -361,10 +360,10 @@
" tokens = [seq.tolist() for seq in tokens]\n",
" for tok_seq in tokens[1:]:\n",
" _midi = tokenizer.decode([deepcopy(tok_seq)])\n",
" midi.instruments.append(_midi.instruments[0])\n",
" midi.instruments[0].name = f'Continuation of original sample ({len(generated)} tokens)'\n",
" midi.instruments[1].name = f'Original sample ({len(prompt)} tokens)'\n",
" midi.instruments[2].name = f'Original sample and continuation'\n",
" midi.tracks.append(_midi.tracks[0])\n",
" midi.tracks[0].name = f'Continuation of original sample ({len(generated)} tokens)'\n",
" midi.tracks[1].name = f'Original sample ({len(prompt)} tokens)'\n",
" midi.tracks[2].name = f'Original sample and continuation'\n",
" midi.dump_midi(gen_results_path / f'{count}.mid')\n",
" tokenizer.save_tokens(tokens, gen_results_path / f'{count}.json') \n",
"\n",
Expand Down

0 comments on commit 04bd93c

Please sign in to comment.