From 7b6dcf098189dc3fa9e18237b06462f57910988d Mon Sep 17 00:00:00 2001 From: bachvudinh Date: Fri, 12 Jul 2024 20:21:47 +0700 Subject: [PATCH 1/3] update requirement --- tests/inference_vllm.py | 47 ----------------------------------------- tests/requirements.txt | 2 +- 2 files changed, 1 insertion(+), 48 deletions(-) delete mode 100644 tests/inference_vllm.py diff --git a/tests/inference_vllm.py b/tests/inference_vllm.py deleted file mode 100644 index f080a5b..0000000 --- a/tests/inference_vllm.py +++ /dev/null @@ -1,47 +0,0 @@ -from accelerate.utils import merge_fsdp_weights -from huggingface_hub import snapshot_download -from vllm import LLM, SamplingParams -from transformers import AutoTokenizer -from datasets import load_dataset -import pandas as pd -# Global variable -save_dir_output = 'output/Jan-Llama3-0708-Result.csv' -sampling_params = SamplingParams(temperature=0.0, max_tokens=200) -local_dir = "jan-hq/Jan-Llama3-0708" -snapshot_download("jan-hq/Jan-Llama3-0708", local_dir=local_dir, max_workers=64) -# Model loading using vllm -tokenizer = AutoTokenizer.from_pretrained("jan-hq/llama-3-sound-init") -llm = LLM(local_dir, tokenizer="jan-hq/llama-3-sound-init") -dataset = load_dataset("jan-hq/instruction-speech-conversation-test", cache_dir="/.cache/")['train'] -def vllm_inference(sample_id): - sound_messages = dataset[sample_id]['sound_convo'][0] - expected_output_messages = dataset[sample_id]['sound_convo'][1] - - sound_input_str = tokenizer.apply_chat_template([sound_messages], tokenize=False, add_generation_prompt=True) - text_input_str = dataset[sample_id]['prompt'] - output_based_on_text = tokenizer.apply_chat_template([expected_output_messages], tokenize=False) - - outputs = llm.generate(sound_input_str, sampling_params) - output_based_on_text = outputs[0].outputs[0].text - output_token_ids = outputs[0].outputs[0].token_ids - - print("-"*50) - print("Text input: ", text_input_str) - print("-"*50) - print("Text output: ", output_str) - print("-"*50) - print("Expected output: ", expected_output_str) - print("-"*50) - print("Output token ids: ", output_token_ids) - print("-"*50) - - return text_input_str, output_based_on_sound, output_based_on_text, output_token_ids -ouput_df = pd.DataFrame() -for i in range(len(dataset)): - text_input_str, output_based_on_sound, output_based_on_text, output_token_ids = vllm_inference(i) - # add to dictionary - output_df['text_input'].append(text_input_str) - output_df['output_based_on_sound'].append(output_based_on_sound) - output_df['output_based_on_text'].append(output_based_on_text) - output_df['output_token_ids'].append(output_token_ids) -output_df.to_csv(save_dir_output, index=False, encoding='utf-8') \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index eb01ff8..8906e66 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,5 @@ datasets==2.20.0 -torch=2.2.0 +torch=2.3.0 transformers vllm huggingface_hub==0.23.4 From aa2462e148d709664320d26d489b76468d809ce4 Mon Sep 17 00:00:00 2001 From: bachvudinh Date: Sat, 13 Jul 2024 11:05:23 +0000 Subject: [PATCH 2/3] refactor and add clear logging for test cases --- tests/README.md | 292 ++++++++++++++++++++++++++++++++++++----- tests/requirements.txt | 2 +- tests/test_case.py | 115 +++++++++++++--- 3 files changed, 356 insertions(+), 53 deletions(-) diff --git a/tests/README.md b/tests/README.md index f4f5e02..25af925 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,33 +1,259 @@ -# research -# Unit-test -## Installing 
Requirements and Running Tests -1. Install the required packages: - ```bash - pip install -r requirements.txt - ``` -2. Run the test suite: - ```bash - python test_case.py --model_dir "jan-hq/Jan-Llama3-0708" \\ - --mode "audio" \\ - --num_rows 100 \\ - ``` -## Test Configuration - -- The test suite uses the following model and dataset: -- Model: "jan-hq/Jan-Llama3-0708" -- Tokenizer: "jan-hq/llama-3-sound-init" -- Dataset: "jan-hq/instruction-speech-conversation-test" - -## What the Tests Cover - -1. Output validation (non-empty, correct type) -2. Token ID validation -3. Input-output relevance using BLEU -4. Special token handling -5. Numerical stability (NaN checks) -6. Check if EOS token are unique and at the end of the generated ids - -## Continuous Integration - -- This test suite can be integrated into CI/CD pipelines. -- model download and inference can take significant time. \ No newline at end of file +--- +datasets: +- jan-hq/instruction-speech-v1 +language: +- en +license: apache-2.0 +tags: +- sound language model +--- + +## Model Details + +We have developed and released the family [Jan-Llama3](https://huggingface.co/collections/jan-hq/jan-llama3-668e4dad446c8736208dca4f). This family is natively understanding audio and text input. + +We continue to expand [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) with sound understanding capabilities by leveraging 700M tokens [Instruction Speech v1](https://huggingface.co/datasets/jan-hq/instruction-speech-v1) dataset. + +**Model developers** Homebrew Research. + +**Input** Text and sound. + +**Output** Text. + +**Model Architecture** Llama-3. + +**Language(s):** English. + +## Intended Use + +**Intended Use Cases** This family is primarily intended for research applications. This version aims to further improve the LLM on sound understanding capabilities. + +**Out-of-scope** The use of Llama-3-Sound in any manner that violates applicable laws or regulations is strictly prohibited. + +## How to Get Started with the Model + +First, we need to convert the audio file to sound tokens + +```python +import torch +import torchaudio +from encodec import EncodecModel +from encodec.utils import convert_audio + +def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"): + # Initialize Encodec + model = EncodecModel.encodec_model_24khz() + model.set_target_bandwidth(target_bandwidth) + model.to(device) + + # Load and preprocess audio + wav, sr = torchaudio.load(audio_path) + wav = convert_audio(wav, sr, model.sample_rate, model.channels) + wav = wav.unsqueeze(0).to(device) + + # Encode audio + with torch.no_grad(): + encoded_frames = model.encode(wav) + codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) + + # Flatten codes + audio_code1, audio_code2 = codes[0][0], codes[0][1] + flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist() + + # Convert to sound tokens + result = ''.join(f'<|sound_{num}|>' for num in flatten_tokens) + return f'<|sound_start|>{result}<|sound_end|>' + +# Usage +sound_tokens = audio_to_sound_tokens("/path/to/your/audio/file") +``` + +Then, we can inference the model the same as any other LLM. 
+ +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline + +def setup_pipeline(model_path, use_4bit=True): + tokenizer = AutoTokenizer.from_pretrained(model_path) + + model_kwargs = {"device_map": "auto"} + + if use_4bit: + model_kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs) + + return pipeline("text-generation", model=model, tokenizer=tokenizer) + +def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False): + generation_args = { + "max_new_tokens": max_new_tokens, + "return_full_text": False, + "temperature": temperature, + "do_sample": do_sample, + } + + output = pipe(messages, **generation_args) + return output[0]['generated_text'] + +# Usage +llm_path = "jan-hq/Jan-Llama3-0708" +pipe = setup_pipeline(llm_path, use_4bit=True) +messages = [ + {"role": "user", "content": sound_tokens}, +] +generated_text = generate_text(pipe, messages) +print(generated_text) +``` + +## Training process +**Training Metrics Image**: Below is a snapshot of the training loss curve visualized. + +![train_loss_curve/png](https://cdn-uploads.huggingface.co/production/uploads/65713d70f56f9538679e5a56/9bv-kpnqrTxaBhiYrVHN7.png) + +### Hardware + +**GPU Configuration**: Cluster of 8x NVIDIA H100-SXM-80GB. +**GPU Usage**: + - **Continual Training**: 8 hours. + +### Training Arguments + +| Parameter | Continual Training | +|----------------------------|-------------------------| +| **Epoch** | 1 | +| **Global batch size** | 128 | +| **Learning Rate** | 5e-5 | +| **Learning Scheduler** | Cosine with warmup | +| **Optimizer** | [Adam-mini](https://arxiv.org/abs/2406.16793) | +| **Warmup Ratio** | 0.1 | +| **Weight Decay** | 0.01 | +| **beta1** | 0.9 | +| **beta2** | 0.98 | +| **epsilon** | 1e-6 | +| **Gradient Cliping** | 1.0 | + +### + Accelerate FSDP Config + +``` +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: true +fsdp_config: + fsdp_activation_checkpointing: true + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: false +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + +## Examples + +1. Good example: +``` +# Question: +Does the following review have a positive or negative opinion of the movie? + +i thought the relationships were wonderful , +Options: A). negative; B). positive; + +# Expected Output: +The review has a B). positive opinion about the movie because they think the relationships were wonderful. + +# Model's Output: +The review is positive. +``` + +2. 
Misunderstanding example: +``` +# Question: +Translate the following sentence to Russian: +6 Work in JOBS and careers after 40 + +Russian: + +# Expected Output: +6 Работать в РАБОТАХ и карьере после 40 + +Russian (Cyrillic): 6 Работать в РАБОТАХ и карьере после 40 + +Let me explain this to you like you're five years old: This sentence means that people can still work and have careers, even if they're older than 40 years old. The words are in a different language, called Russian. +# Model's Output: +В случае, когда вы хотите перевести фразу "The first step is to identify the language of the text." на русский язык, вы можете использовать следующую фразу: "Первый шаг - определить язык текста." +``` + +3. Off-tracked example: +``` +# Quesion: +Answer the following question: - The roots of the growing plants start to break up the rock - The plant acids dissolve the rock - The rock is broken down into smaller pieces - Erosion begins. What might be the first step of the process? +Answer: + +# Expected Output: +The first step of the process might be: The roots of the growing plants start to break up the rock. + +# Model's Output: +Step 1: Identify the key elements in the premise and hypothesis. + +Premise: "A man in a blue shirt and a red hat is holding a sign." +Key elements: man, blue shirt, red hat, holding, sign + +Hypothesis: "A man is holding a sign." +Key elements: man, holding, sign + +Step 2: Compare the key elements in the premise and hypothesis. + +In the premise, we have a man wearing a blue shirt and a red hat, while in the hypothesis, we only have a man. The premise also mentions that the man is holding a sign, which is also mentioned in the hypothesis. + +Step 3: Determine if the hypothesis is entailed by the premise. + +Since the hypothesis states that a man is holding a sign, and the premise confirms that a man is holding a sign, we can conclude that the hypothesis is entailed by the premise. The additional information about the man's clothing in the premise does not contradict or negate the +``` + +Despite being undertrained, the model demonstrates an emerging grasp of sound-text semantics. 
+ + +## Citation Information + +**BibTeX:** + +``` +@article{Llama-3-Sound: Sound Instruction LLM 2024, + title={Llama-3-Sound}, + author={Homebrew Research}, + year=2024, + month=July}, + url={https://huggingface.co/jan-hq/Jan-Llama3-0708} +``` + +## Acknowledgement + +- **[WhisperSpeech](https://github.com/collabora/WhisperSpeech)** + +- **[Encodec](https://github.com/facebookresearch/encodec)** + +- **[Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)** \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index 8906e66..e41cf80 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,5 @@ datasets==2.20.0 -torch=2.3.0 +torch==2.3.0 transformers vllm huggingface_hub==0.23.4 diff --git a/tests/test_case.py b/tests/test_case.py index d77ee2d..6743849 100644 --- a/tests/test_case.py +++ b/tests/test_case.py @@ -9,12 +9,89 @@ from nltk.translate.bleu_score import sentence_bleu, corpus_bleu import argparse import os +import sys +from io import StringIO +import time +# Decorator Class +class CustomTestResult(unittest.TestResult): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.successes = [] + + def addSuccess(self, test): + super().addSuccess(test) + self.successes.append(test) + +class CustomTestRunner(unittest.TextTestRunner): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.stream = StringIO() + self.results = [] + + def run(self, test): + result = CustomTestResult() + start_time = time.time() + test(result) + time_taken = time.time() - start_time + self.results.append((result, time_taken)) + return result + + def print_results(self): + print("\n=== Test Results ===") + total_tests = 0 + total_successes = 0 + total_failures = 0 + total_errors = 0 + total_time = 0 + + for result, time_taken in self.results: + total_tests += result.testsRun + total_successes += len(result.successes) + total_failures += len(result.failures) + total_errors += len(result.errors) + total_time += time_taken + + print(f"Ran {total_tests} tests in {total_time:.3f} seconds") + print(f"Successes: {total_successes}") + print(f"Failures: {total_failures}") + print(f"Errors: {total_errors}") + + print("\nDetailed Results:") + for result, time_taken in self.results: + # todo: add time taken for each test + for test in result.successes: + print(f"PASS: {test._testMethodName}") + for test, _ in result.failures: + print(f"FAIL: {test._testMethodName}") + for test, _ in result.errors: + test_name = getattr(test, '_testMethodName', str(test)) + print(f"ERROR: {test_name}") + + if total_failures > 0 or total_errors > 0: + print("\nFailure and Error Details:") + for result, _ in self.results: + for test, traceback in result.failures: + print(f"\nFAILURE: {test._testMethodName}") + print(traceback) + for test, traceback in result.errors: + test_name = getattr(test, '_testMethodName', str(test)) + print(f"\nERROR: {test_name}") + print(traceback) + else: + print("\nAll tests passed successfully!") + +def test_name(name): + def decorator(func): + func.__name__ = name + return func + return decorator def parse_arguments(): parser = argparse.ArgumentParser(description="Run inference on a Sound-To-Text Model.") - parser.add_argument("--model_dir", type=str, required=True, help="Hugging Face model link or local_dir") - parser.add_argument("--model_save_dir", type=str, required=True, help="Local directory that model is saved") + parser.add_argument("--model_dir", type=str, 
default="jan-hq/Jan-Llama3-0708", help="Hugging Face model link or local_dir") + parser.add_argument("--max_length", type=int, default=1024, help="Maximum length of the output") parser.add_argument("--data_dir", type=str, required=True, help="Hugging Face model repository link or Data path") + parser.add_argument("--cache_dir", type=str, default=".", help="Absolute path to save the model and dataset") parser.add_argument("--mode", type=str, default="audio", help="Mode of the model (audio or text)") parser.add_argument("--num_rows", type=int, default=5, help="Number of dataset rows to process") parser.add_argument("--output_file", type=str, default="output/", help="Output file path") @@ -29,23 +106,20 @@ def setUpClass(cls): cls.save_dir_output = f'{args.output_file}/{model_name}-{args.mode}-Result.csv' if not os.path.exists(args.output_file): os.makedirs(args.output_file) - cls.sampling_params = SamplingParams(temperature=0.0, max_tokens=1024, skip_special_tokens=False) - model_dir = "" - if os.path.exists(args.model_save_dir): - model_dir = args.model_save_dir + cls.sampling_params = SamplingParams(temperature=0.0, max_tokens=args.max_length, skip_special_tokens=False) + # Download model + model_save_dir = os.path.join(args.cache_dir, args.model_dir) + if not os.path.exists(model_save_dir): + snapshot_download(args.model_dir, local_dir=model_save_dir, max_workers=64) else: - # Download model - if not os.path.exists(args.model_dir): - snapshot_download(args.model_dir, local_dir=args.model_dir, max_workers=64) - else: - print(f"Found {args.model_dir}. Skipping download.") - model_dir = args.model_dir + print(f"Found {model_save_dir}. Skipping download.") # Model loading using vllm - cls.tokenizer = AutoTokenizer.from_pretrained(model_dir) - cls.llm = LLM(model_dir, tokenizer=model_dir) + cls.tokenizer = AutoTokenizer.from_pretrained(model_save_dir) + cls.llm = LLM(model_save_dir, tokenizer=model_save_dir) # Load dataset - cls.dataset = load_dataset(args.data_dir, cache_dir=".cache/")['train'] + data_save_dir = os.path.join(args.cache_dir, args.data_dir) + cls.dataset = load_dataset(args.data_dir, split='train') cls.num_rows = min(args.num_rows, len(cls.dataset)) cls.inference_results = [] if args.mode == "audio": @@ -88,6 +162,7 @@ def vllm_qna_inference(self, sample_id): # return input_str, output_based_on_input, expected_output_str, output_token_ids + @test_name("Output validation (non-empty, correct type)") def test_model_output(self): for text_input_str, output_based_on_sound, expected_output_str, output_token_ids in self.inference_results: # Test 1: Check if output is not empty @@ -104,7 +179,7 @@ def test_model_output(self): # output_words = set(output_based_on_sound.lower().split()) # relevance_score = corpus_bleu(output_words, reference_words) # self.assertGreater(relevance_score, 0.3) - + @test_name("Test Special Tokens Handling") def test_special_tokens(self): # Test 5: Check if special tokens are handled correctly special_tokens = [self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token] @@ -119,12 +194,12 @@ def test_special_tokens(self): # results = [self.inference_results[0][1] for _ in range(3)] # self.assertEqual(results[0], results[1]) # self.assertEqual(results[1], results[2]) - + @test_name("Test for NaN outputs") def test_no_nan_outputs(self): # Test 7: Check for NaN outputs for _, output, _, _ in self.inference_results: self.assertFalse(any(np.isnan(float(word)) for word in output.split() if word.replace('.', '').isdigit())) - + @test_name("Test for 
EOS token generation") def test_eos_token_generation(self): # Test 8: Check if EOS token is generated for _, output_based_on_sound, _, output_token_ids in self.inference_results: @@ -142,4 +217,6 @@ def test_eos_token_generation(self): if __name__ == "__main__": - unittest.main(argv=['first-arg-is-ignored'], exit=False) \ No newline at end of file + runner = CustomTestRunner(stream=sys.stdout, verbosity=2) + unittest.main(argv=['first-arg-is-ignored'], exit=False, testRunner=runner) + runner.print_results() \ No newline at end of file From c916c759a81fb224b7a84376d71bf741e0c370a9 Mon Sep 17 00:00:00 2001 From: bachvudinh Date: Sat, 13 Jul 2024 11:08:04 +0000 Subject: [PATCH 3/3] correct readme --- tests/README.md | 294 ++++++------------------------------------------ 1 file changed, 35 insertions(+), 259 deletions(-) diff --git a/tests/README.md b/tests/README.md index 25af925..b902279 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,259 +1,35 @@ ---- -datasets: -- jan-hq/instruction-speech-v1 -language: -- en -license: apache-2.0 -tags: -- sound language model ---- - -## Model Details - -We have developed and released the family [Jan-Llama3](https://huggingface.co/collections/jan-hq/jan-llama3-668e4dad446c8736208dca4f). This family is natively understanding audio and text input. - -We continue to expand [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) with sound understanding capabilities by leveraging 700M tokens [Instruction Speech v1](https://huggingface.co/datasets/jan-hq/instruction-speech-v1) dataset. - -**Model developers** Homebrew Research. - -**Input** Text and sound. - -**Output** Text. - -**Model Architecture** Llama-3. - -**Language(s):** English. - -## Intended Use - -**Intended Use Cases** This family is primarily intended for research applications. This version aims to further improve the LLM on sound understanding capabilities. - -**Out-of-scope** The use of Llama-3-Sound in any manner that violates applicable laws or regulations is strictly prohibited. - -## How to Get Started with the Model - -First, we need to convert the audio file to sound tokens - -```python -import torch -import torchaudio -from encodec import EncodecModel -from encodec.utils import convert_audio - -def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"): - # Initialize Encodec - model = EncodecModel.encodec_model_24khz() - model.set_target_bandwidth(target_bandwidth) - model.to(device) - - # Load and preprocess audio - wav, sr = torchaudio.load(audio_path) - wav = convert_audio(wav, sr, model.sample_rate, model.channels) - wav = wav.unsqueeze(0).to(device) - - # Encode audio - with torch.no_grad(): - encoded_frames = model.encode(wav) - codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) - - # Flatten codes - audio_code1, audio_code2 = codes[0][0], codes[0][1] - flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist() - - # Convert to sound tokens - result = ''.join(f'<|sound_{num}|>' for num in flatten_tokens) - return f'<|sound_start|>{result}<|sound_end|>' - -# Usage -sound_tokens = audio_to_sound_tokens("/path/to/your/audio/file") -``` - -Then, we can inference the model the same as any other LLM. 
- -```python -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline - -def setup_pipeline(model_path, use_4bit=True): - tokenizer = AutoTokenizer.from_pretrained(model_path) - - model_kwargs = {"device_map": "auto"} - - if use_4bit: - model_kwargs["quantization_config"] = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - ) - - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs) - - return pipeline("text-generation", model=model, tokenizer=tokenizer) - -def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False): - generation_args = { - "max_new_tokens": max_new_tokens, - "return_full_text": False, - "temperature": temperature, - "do_sample": do_sample, - } - - output = pipe(messages, **generation_args) - return output[0]['generated_text'] - -# Usage -llm_path = "jan-hq/Jan-Llama3-0708" -pipe = setup_pipeline(llm_path, use_4bit=True) -messages = [ - {"role": "user", "content": sound_tokens}, -] -generated_text = generate_text(pipe, messages) -print(generated_text) -``` - -## Training process -**Training Metrics Image**: Below is a snapshot of the training loss curve visualized. - -![train_loss_curve/png](https://cdn-uploads.huggingface.co/production/uploads/65713d70f56f9538679e5a56/9bv-kpnqrTxaBhiYrVHN7.png) - -### Hardware - -**GPU Configuration**: Cluster of 8x NVIDIA H100-SXM-80GB. -**GPU Usage**: - - **Continual Training**: 8 hours. - -### Training Arguments - -| Parameter | Continual Training | -|----------------------------|-------------------------| -| **Epoch** | 1 | -| **Global batch size** | 128 | -| **Learning Rate** | 5e-5 | -| **Learning Scheduler** | Cosine with warmup | -| **Optimizer** | [Adam-mini](https://arxiv.org/abs/2406.16793) | -| **Warmup Ratio** | 0.1 | -| **Weight Decay** | 0.01 | -| **beta1** | 0.9 | -| **beta2** | 0.98 | -| **epsilon** | 1e-6 | -| **Gradient Cliping** | 1.0 | - -### - Accelerate FSDP Config - -``` -compute_environment: LOCAL_MACHINE -debug: false -distributed_type: FSDP -downcast_bf16: 'no' -enable_cpu_affinity: true -fsdp_config: - fsdp_activation_checkpointing: true - fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP - fsdp_backward_prefetch: BACKWARD_PRE - fsdp_cpu_ram_efficient_loading: true - fsdp_forward_prefetch: false - fsdp_offload_params: false - fsdp_sharding_strategy: FULL_SHARD - fsdp_state_dict_type: SHARDED_STATE_DICT - fsdp_sync_module_states: true - fsdp_use_orig_params: false -machine_rank: 0 -main_training_function: main -mixed_precision: bf16 -num_machines: 1 -num_processes: 8 -rdzv_backend: static -same_network: true -tpu_env: [] -tpu_use_cluster: false -tpu_use_sudo: false -use_cpu: false -``` - -## Examples - -1. Good example: -``` -# Question: -Does the following review have a positive or negative opinion of the movie? - -i thought the relationships were wonderful , -Options: A). negative; B). positive; - -# Expected Output: -The review has a B). positive opinion about the movie because they think the relationships were wonderful. - -# Model's Output: -The review is positive. -``` - -2. 
Misunderstanding example: -``` -# Question: -Translate the following sentence to Russian: -6 Work in JOBS and careers after 40 - -Russian: - -# Expected Output: -6 Работать в РАБОТАХ и карьере после 40 - -Russian (Cyrillic): 6 Работать в РАБОТАХ и карьере после 40 - -Let me explain this to you like you're five years old: This sentence means that people can still work and have careers, even if they're older than 40 years old. The words are in a different language, called Russian. -# Model's Output: -В случае, когда вы хотите перевести фразу "The first step is to identify the language of the text." на русский язык, вы можете использовать следующую фразу: "Первый шаг - определить язык текста." -``` - -3. Off-tracked example: -``` -# Quesion: -Answer the following question: - The roots of the growing plants start to break up the rock - The plant acids dissolve the rock - The rock is broken down into smaller pieces - Erosion begins. What might be the first step of the process? -Answer: - -# Expected Output: -The first step of the process might be: The roots of the growing plants start to break up the rock. - -# Model's Output: -Step 1: Identify the key elements in the premise and hypothesis. - -Premise: "A man in a blue shirt and a red hat is holding a sign." -Key elements: man, blue shirt, red hat, holding, sign - -Hypothesis: "A man is holding a sign." -Key elements: man, holding, sign - -Step 2: Compare the key elements in the premise and hypothesis. - -In the premise, we have a man wearing a blue shirt and a red hat, while in the hypothesis, we only have a man. The premise also mentions that the man is holding a sign, which is also mentioned in the hypothesis. - -Step 3: Determine if the hypothesis is entailed by the premise. - -Since the hypothesis states that a man is holding a sign, and the premise confirms that a man is holding a sign, we can conclude that the hypothesis is entailed by the premise. The additional information about the man's clothing in the premise does not contradict or negate the -``` - -Despite being undertrained, the model demonstrates an emerging grasp of sound-text semantics. - - -## Citation Information - -**BibTeX:** - -``` -@article{Llama-3-Sound: Sound Instruction LLM 2024, - title={Llama-3-Sound}, - author={Homebrew Research}, - year=2024, - month=July}, - url={https://huggingface.co/jan-hq/Jan-Llama3-0708} -``` - -## Acknowledgement - -- **[WhisperSpeech](https://github.com/collabora/WhisperSpeech)** - -- **[Encodec](https://github.com/facebookresearch/encodec)** - -- **[Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)** \ No newline at end of file +# research +# Unit-test +## Installing Requirements and Running Tests +1. Install the required packages: + ```bash + pip install -r requirements.txt + ``` +2. Run the test suite: + ```bash + python test_case.py --model_dir "jan-hq/Jan-Llama3-0708" \\ + --max_length 1024 \\ + --data_dir "jan-hq/instruction-speech-conversation-test" \\ + --mode "audio" \\ + --num_rows 5 \\ + ``` +## Test Configuration + +- The test suite uses the following model and dataset: +- Model: "jan-hq/Jan-Llama3-0708" +- Tokenizer: "jan-hq/llama-3-sound-init" +- Dataset: "jan-hq/instruction-speech-conversation-test" + +## What the Tests Cover + +1. Output validation (non-empty, correct type) +2. Token ID validation +3. Input-output relevance using BLEU +4. Special token handling +5. Numerical stability (NaN checks) +6. 
Check that the EOS token appears exactly once, at the end of the generated IDs
+
+## Continuous Integration
+
+- This test suite can be integrated into CI/CD pipelines.
+- Note that model download and inference can take significant time.
\ No newline at end of file