updates to Example_HuggingFace_Mistral_Transformer.ipynb #164

Merged
merged 1 commit on Apr 30, 2024
69 changes: 34 additions & 35 deletions colab-notebooks/Example_HuggingFace_Mistral_Transformer.ipynb
@@ -33,32 +33,31 @@
"source": [
"#@title Install all dependencies (run only once per session)\n",
"\n",
"!nvidia-smi\n",
"# Run the following command only if a nvidia driver is installed\n",
"%nvidia-smi\n",
"\n",
"!pip install miditok\n",
"!pip install symusic\n",
"!pip install torch\n",
"!pip install torchtoolkit\n",
"!pip install transformers\n",
"!pip install accelerate\n",
"!pip install evaluate\n",
"!pip install tqdm\n",
"!pip install scikit-learn\n",
"!pip install tensorboard\n",
"%pip install miditok\n",
"%pip install symusic\n",
"%pip install torch\n",
"%pip install transformers\n",
"%pip install accelerate\n",
"%pip install evaluate\n",
"%pip install tensorboard\n",
"%pip install scikit-learn\n",
"\n",
"!wget https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip\n",
"!unzip 'maestro-v3.0.0-midi.zip'\n",
"!rm 'maestro-v3.0.0-midi.zip'\n",
"!mv 'maestro-v3.0.0' 'Maestro'\n",
"%wget https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip\n",
"%unzip 'maestro-v3.0.0-midi.zip'\n",
"%rm 'maestro-v3.0.0-midi.zip'\n",
"%mv 'maestro-v3.0.0' 'Maestro'\n",
"\n",
"from copy import deepcopy\n",
"from pathlib import Path\n",
"from random import shuffle\n",
"\n",
"from evaluate import load as load_metric\n",
"from miditok import REMI, TokenizerConfig\n",
"from miditok.pytorch_data import DatasetMIDI, DataCollator, split_midis_for_training\n",
"from miditok.data_augmentation import augment_midi_dataset\n",
"from miditok.pytorch_data import DatasetMIDI, DataCollator, split_files_for_training\n",
"from miditok.data_augmentation import augment_dataset\n",
"from torch import Tensor, argmax\n",
"from torch.utils.data import DataLoader\n",
"from torch.cuda import is_available as cuda_available, is_bf16_supported\n",
@@ -105,8 +104,8 @@
"# Creates the tokenizer\n",
"tokenizer = REMI(config)\n",
"\n",
"# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 10k tokens\n",
"midi_paths = list(Path(\"Maestro\").glob(\"**/*.mid\")) + list(Path(\"Maestro\").glob(\"**/*.midi\"))\n",
"# Trains the tokenizer with Byte Pair Encoding (BPE) to build the vocabulary, here 30k tokens\n",
"midi_paths = list(Path(\"Maestro\").resolve().glob(\"**/*.mid\")) + list(Path(\"Maestro\").resolve().glob(\"**/*.midi\"))\n",
"tokenizer.train(\n",
" vocab_size=30000,\n",
" files_paths=midi_paths,\n",
@@ -140,13 +139,13 @@
"midi_paths_train = midi_paths[num_files_valid + num_files_test:]\n",
"\n",
"# Chunk MIDIs and perform data augmentation on each subset independently\n",
"for files_paths, subset_name in {\n",
"for files_paths, subset_name in (\n",
" (midi_paths_train, \"train\"), (midi_paths_valid, \"valid\"), (midi_paths_test, \"test\")\n",
"}:\n",
"):\n",
"\n",
" # Split the MIDIs into chunks of sizes approximatly about 1024 tokens\n",
" # Split the MIDIs into chunks of sizes approximately about 1024 tokens\n",
" subset_chunks_dir = Path(f\"Maestro_{subset_name}\")\n",
" split_midis_for_training(\n",
" split_files_for_training(\n",
" files_paths=files_paths,\n",
" tokenizer=tokenizer,\n",
" save_dir=subset_chunks_dir,\n",
@@ -155,7 +154,7 @@
" )\n",
"\n",
" # Perform data augmentation\n",
" augment_midi_dataset(\n",
" augment_dataset(\n",
" subset_chunks_dir,\n",
" pitch_offsets=[-12, 12],\n",
" velocity_offsets=[-4, 4],\n",
@@ -187,7 +186,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -268,7 +267,7 @@
" learning_rate=1e-4,\n",
" weight_decay=0.01,\n",
" max_grad_norm=3.0,\n",
" max_steps=100000,\n",
" max_steps=20000,\n",
" lr_scheduler_type=\"cosine_with_restarts\",\n",
" warmup_ratio=0.3,\n",
" log_level=\"debug\",\n",
@@ -290,7 +289,7 @@
" gradient_checkpointing=True,\n",
")\n",
"\n",
"collator = DataCollator(tokenizer[\"PAD_None\"])\n",
"collator = DataCollator(tokenizer[\"PAD_None\"], copy_inputs_as_labels=True)\n",
"trainer = Trainer(\n",
" model=model,\n",
" args=training_config,\n",
@@ -329,15 +328,15 @@
"source": [
"(gen_results_path := Path('gen_res')).mkdir(parents=True, exist_ok=True)\n",
"generation_config = GenerationConfig(\n",
" max_new_tokens=512, # extends samples by 512 tokens\n",
" num_beams=1, # no beam search\n",
" do_sample=True, # but sample instead\n",
" max_new_tokens=200, # extends samples by 200 tokens\n",
" num_beams=1, # no beam search\n",
" do_sample=True, # but sample instead\n",
" temperature=0.9,\n",
" top_k=15,\n",
" top_p=0.95,\n",
" epsilon_cutoff=3e-4,\n",
" eta_cutoff=1e-3,\n",
" pad_token_id=config.padding_token_id,\n",
" pad_token_id=tokenizer.pad_token_id,\n",
")\n",
"\n",
"# Here the sequences are padded to the left, so that the last token along the time dimension\n",
@@ -361,10 +360,10 @@
" tokens = [seq.tolist() for seq in tokens]\n",
" for tok_seq in tokens[1:]:\n",
" _midi = tokenizer.decode([deepcopy(tok_seq)])\n",
" midi.instruments.append(_midi.instruments[0])\n",
" midi.instruments[0].name = f'Continuation of original sample ({len(generated)} tokens)'\n",
" midi.instruments[1].name = f'Original sample ({len(prompt)} tokens)'\n",
" midi.instruments[2].name = f'Original sample and continuation'\n",
" midi.tracks.append(_midi.tracks[0])\n",
" midi.tracks[0].name = f'Continuation of original sample ({len(generated)} tokens)'\n",
" midi.tracks[1].name = f'Original sample ({len(prompt)} tokens)'\n",
" midi.tracks[2].name = f'Original sample and continuation'\n",
" midi.dump_midi(gen_results_path / f'{count}.mid')\n",
" tokenizer.save_tokens(tokens, gen_results_path / f'{count}.json') \n",
"\n",