From 7b6dcf098189dc3fa9e18237b06462f57910988d Mon Sep 17 00:00:00 2001 From: bachvudinh Date: Fri, 12 Jul 2024 20:21:47 +0700 Subject: [PATCH 1/3] update requirement --- tests/inference_vllm.py | 47 ----------------------------------------- tests/requirements.txt | 2 +- 2 files changed, 1 insertion(+), 48 deletions(-) delete mode 100644 tests/inference_vllm.py diff --git a/tests/inference_vllm.py b/tests/inference_vllm.py deleted file mode 100644 index f080a5b..0000000 --- a/tests/inference_vllm.py +++ /dev/null @@ -1,47 +0,0 @@ -from accelerate.utils import merge_fsdp_weights -from huggingface_hub import snapshot_download -from vllm import LLM, SamplingParams -from transformers import AutoTokenizer -from datasets import load_dataset -import pandas as pd -# Global variable -save_dir_output = 'output/Jan-Llama3-0708-Result.csv' -sampling_params = SamplingParams(temperature=0.0, max_tokens=200) -local_dir = "jan-hq/Jan-Llama3-0708" -snapshot_download("jan-hq/Jan-Llama3-0708", local_dir=local_dir, max_workers=64) -# Model loading using vllm -tokenizer = AutoTokenizer.from_pretrained("jan-hq/llama-3-sound-init") -llm = LLM(local_dir, tokenizer="jan-hq/llama-3-sound-init") -dataset = load_dataset("jan-hq/instruction-speech-conversation-test", cache_dir="/.cache/")['train'] -def vllm_inference(sample_id): - sound_messages = dataset[sample_id]['sound_convo'][0] - expected_output_messages = dataset[sample_id]['sound_convo'][1] - - sound_input_str = tokenizer.apply_chat_template([sound_messages], tokenize=False, add_generation_prompt=True) - text_input_str = dataset[sample_id]['prompt'] - output_based_on_text = tokenizer.apply_chat_template([expected_output_messages], tokenize=False) - - outputs = llm.generate(sound_input_str, sampling_params) - output_based_on_text = outputs[0].outputs[0].text - output_token_ids = outputs[0].outputs[0].token_ids - - print("-"*50) - print("Text input: ", text_input_str) - print("-"*50) - print("Text output: ", output_str) - print("-"*50) - print("Expected output: ", expected_output_str) - print("-"*50) - print("Output token ids: ", output_token_ids) - print("-"*50) - - return text_input_str, output_based_on_sound, output_based_on_text, output_token_ids -ouput_df = pd.DataFrame() -for i in range(len(dataset)): - text_input_str, output_based_on_sound, output_based_on_text, output_token_ids = vllm_inference(i) - # add to dictionary - output_df['text_input'].append(text_input_str) - output_df['output_based_on_sound'].append(output_based_on_sound) - output_df['output_based_on_text'].append(output_based_on_text) - output_df['output_token_ids'].append(output_token_ids) -output_df.to_csv(save_dir_output, index=False, encoding='utf-8') \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index eb01ff8..8906e66 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,5 @@ datasets==2.20.0 -torch=2.2.0 +torch=2.3.0 transformers vllm huggingface_hub==0.23.4 From aa2462e148d709664320d26d489b76468d809ce4 Mon Sep 17 00:00:00 2001 From: bachvudinh Date: Sat, 13 Jul 2024 11:05:23 +0000 Subject: [PATCH 2/3] refactor and add clear logging for test cases --- tests/README.md | 292 ++++++++++++++++++++++++++++++++++++----- tests/requirements.txt | 2 +- tests/test_case.py | 115 +++++++++++++--- 3 files changed, 356 insertions(+), 53 deletions(-) diff --git a/tests/README.md b/tests/README.md index f4f5e02..25af925 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,33 +1,259 @@ -# research -# Unit-test -## Installing 
Requirements and Running Tests -1. Install the required packages: - ```bash - pip install -r requirements.txt - ``` -2. Run the test suite: - ```bash - python test_case.py --model_dir "jan-hq/Jan-Llama3-0708" \\ - --mode "audio" \\ - --num_rows 100 \\ - ``` -## Test Configuration - -- The test suite uses the following model and dataset: -- Model: "jan-hq/Jan-Llama3-0708" -- Tokenizer: "jan-hq/llama-3-sound-init" -- Dataset: "jan-hq/instruction-speech-conversation-test" - -## What the Tests Cover - -1. Output validation (non-empty, correct type) -2. Token ID validation -3. Input-output relevance using BLEU -4. Special token handling -5. Numerical stability (NaN checks) -6. Check if EOS token are unique and at the end of the generated ids - -## Continuous Integration - -- This test suite can be integrated into CI/CD pipelines. -- model download and inference can take significant time. \ No newline at end of file +--- +datasets: +- jan-hq/instruction-speech-v1 +language: +- en +license: apache-2.0 +tags: +- sound language model +--- + +## Model Details + +We have developed and released the family [Jan-Llama3](https://huggingface.co/collections/jan-hq/jan-llama3-668e4dad446c8736208dca4f). This family is natively understanding audio and text input. + +We continue to expand [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) with sound understanding capabilities by leveraging 700M tokens [Instruction Speech v1](https://huggingface.co/datasets/jan-hq/instruction-speech-v1) dataset. + +**Model developers** Homebrew Research. + +**Input** Text and sound. + +**Output** Text. + +**Model Architecture** Llama-3. + +**Language(s):** English. + +## Intended Use + +**Intended Use Cases** This family is primarily intended for research applications. This version aims to further improve the LLM on sound understanding capabilities. + +**Out-of-scope** The use of Llama-3-Sound in any manner that violates applicable laws or regulations is strictly prohibited. + +## How to Get Started with the Model + +First, we need to convert the audio file to sound tokens + +```python +import torch +import torchaudio +from encodec import EncodecModel +from encodec.utils import convert_audio + +def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"): + # Initialize Encodec + model = EncodecModel.encodec_model_24khz() + model.set_target_bandwidth(target_bandwidth) + model.to(device) + + # Load and preprocess audio + wav, sr = torchaudio.load(audio_path) + wav = convert_audio(wav, sr, model.sample_rate, model.channels) + wav = wav.unsqueeze(0).to(device) + + # Encode audio + with torch.no_grad(): + encoded_frames = model.encode(wav) + codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) + + # Flatten codes + audio_code1, audio_code2 = codes[0][0], codes[0][1] + flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist() + + # Convert to sound tokens + result = ''.join(f'<|sound_{num}|>' for num in flatten_tokens) + return f'<|sound_start|>{result}<|sound_end|>' + +# Usage +sound_tokens = audio_to_sound_tokens("/path/to/your/audio/file") +``` + +Then, we can inference the model the same as any other LLM. 
+ +```python +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline + +def setup_pipeline(model_path, use_4bit=True): + tokenizer = AutoTokenizer.from_pretrained(model_path) + + model_kwargs = {"device_map": "auto"} + + if use_4bit: + model_kwargs["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type="nf4", + ) + + model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs) + + return pipeline("text-generation", model=model, tokenizer=tokenizer) + +def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False): + generation_args = { + "max_new_tokens": max_new_tokens, + "return_full_text": False, + "temperature": temperature, + "do_sample": do_sample, + } + + output = pipe(messages, **generation_args) + return output[0]['generated_text'] + +# Usage +llm_path = "jan-hq/Jan-Llama3-0708" +pipe = setup_pipeline(llm_path, use_4bit=True) +messages = [ + {"role": "user", "content": sound_tokens}, +] +generated_text = generate_text(pipe, messages) +print(generated_text) +``` + +## Training process +**Training Metrics Image**: Below is a snapshot of the training loss curve visualized. + +![train_loss_curve/png](https://cdn-uploads.huggingface.co/production/uploads/65713d70f56f9538679e5a56/9bv-kpnqrTxaBhiYrVHN7.png) + +### Hardware + +**GPU Configuration**: Cluster of 8x NVIDIA H100-SXM-80GB. +**GPU Usage**: + - **Continual Training**: 8 hours. + +### Training Arguments + +| Parameter | Continual Training | +|----------------------------|-------------------------| +| **Epoch** | 1 | +| **Global batch size** | 128 | +| **Learning Rate** | 5e-5 | +| **Learning Scheduler** | Cosine with warmup | +| **Optimizer** | [Adam-mini](https://arxiv.org/abs/2406.16793) | +| **Warmup Ratio** | 0.1 | +| **Weight Decay** | 0.01 | +| **beta1** | 0.9 | +| **beta2** | 0.98 | +| **epsilon** | 1e-6 | +| **Gradient Cliping** | 1.0 | + +### + Accelerate FSDP Config + +``` +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: FSDP +downcast_bf16: 'no' +enable_cpu_affinity: true +fsdp_config: + fsdp_activation_checkpointing: true + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch: BACKWARD_PRE + fsdp_cpu_ram_efficient_loading: true + fsdp_forward_prefetch: false + fsdp_offload_params: false + fsdp_sharding_strategy: FULL_SHARD + fsdp_state_dict_type: SHARDED_STATE_DICT + fsdp_sync_module_states: true + fsdp_use_orig_params: false +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + +## Examples + +1. Good example: +``` +# Question: +Does the following review have a positive or negative opinion of the movie? + +i thought the relationships were wonderful , +Options: A). negative; B). positive; + +# Expected Output: +The review has a B). positive opinion about the movie because they think the relationships were wonderful. + +# Model's Output: +The review is positive. +``` + +2. 
Misunderstanding example: +``` +# Question: +Translate the following sentence to Russian: +6 Work in JOBS and careers after 40 + +Russian: + +# Expected Output: +6 Работать в РАБОТАХ и карьере после 40 + +Russian (Cyrillic): 6 Работать в РАБОТАХ и карьере после 40 + +Let me explain this to you like you're five years old: This sentence means that people can still work and have careers, even if they're older than 40 years old. The words are in a different language, called Russian. +# Model's Output: +В случае, когда вы хотите перевести фразу "The first step is to identify the language of the text." на русский язык, вы можете использовать следующую фразу: "Первый шаг - определить язык текста." +``` + +3. Off-tracked example: +``` +# Quesion: +Answer the following question: - The roots of the growing plants start to break up the rock - The plant acids dissolve the rock - The rock is broken down into smaller pieces - Erosion begins. What might be the first step of the process? +Answer: + +# Expected Output: +The first step of the process might be: The roots of the growing plants start to break up the rock. + +# Model's Output: +Step 1: Identify the key elements in the premise and hypothesis. + +Premise: "A man in a blue shirt and a red hat is holding a sign." +Key elements: man, blue shirt, red hat, holding, sign + +Hypothesis: "A man is holding a sign." +Key elements: man, holding, sign + +Step 2: Compare the key elements in the premise and hypothesis. + +In the premise, we have a man wearing a blue shirt and a red hat, while in the hypothesis, we only have a man. The premise also mentions that the man is holding a sign, which is also mentioned in the hypothesis. + +Step 3: Determine if the hypothesis is entailed by the premise. + +Since the hypothesis states that a man is holding a sign, and the premise confirms that a man is holding a sign, we can conclude that the hypothesis is entailed by the premise. The additional information about the man's clothing in the premise does not contradict or negate the +``` + +Despite being undertrained, the model demonstrates an emerging grasp of sound-text semantics. 
+ + +## Citation Information + +**BibTeX:** + +``` +@article{Llama-3-Sound: Sound Instruction LLM 2024, + title={Llama-3-Sound}, + author={Homebrew Research}, + year=2024, + month=July}, + url={https://huggingface.co/jan-hq/Jan-Llama3-0708} +``` + +## Acknowledgement + +- **[WhisperSpeech](https://github.com/collabora/WhisperSpeech)** + +- **[Encodec](https://github.com/facebookresearch/encodec)** + +- **[Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)** \ No newline at end of file diff --git a/tests/requirements.txt b/tests/requirements.txt index 8906e66..e41cf80 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,5 +1,5 @@ datasets==2.20.0 -torch=2.3.0 +torch==2.3.0 transformers vllm huggingface_hub==0.23.4 diff --git a/tests/test_case.py b/tests/test_case.py index d77ee2d..6743849 100644 --- a/tests/test_case.py +++ b/tests/test_case.py @@ -9,12 +9,89 @@ from nltk.translate.bleu_score import sentence_bleu, corpus_bleu import argparse import os +import sys +from io import StringIO +import time +# Decorator Class +class CustomTestResult(unittest.TestResult): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.successes = [] + + def addSuccess(self, test): + super().addSuccess(test) + self.successes.append(test) + +class CustomTestRunner(unittest.TextTestRunner): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.stream = StringIO() + self.results = [] + + def run(self, test): + result = CustomTestResult() + start_time = time.time() + test(result) + time_taken = time.time() - start_time + self.results.append((result, time_taken)) + return result + + def print_results(self): + print("\n=== Test Results ===") + total_tests = 0 + total_successes = 0 + total_failures = 0 + total_errors = 0 + total_time = 0 + + for result, time_taken in self.results: + total_tests += result.testsRun + total_successes += len(result.successes) + total_failures += len(result.failures) + total_errors += len(result.errors) + total_time += time_taken + + print(f"Ran {total_tests} tests in {total_time:.3f} seconds") + print(f"Successes: {total_successes}") + print(f"Failures: {total_failures}") + print(f"Errors: {total_errors}") + + print("\nDetailed Results:") + for result, time_taken in self.results: + # todo: add time taken for each test + for test in result.successes: + print(f"PASS: {test._testMethodName}") + for test, _ in result.failures: + print(f"FAIL: {test._testMethodName}") + for test, _ in result.errors: + test_name = getattr(test, '_testMethodName', str(test)) + print(f"ERROR: {test_name}") + + if total_failures > 0 or total_errors > 0: + print("\nFailure and Error Details:") + for result, _ in self.results: + for test, traceback in result.failures: + print(f"\nFAILURE: {test._testMethodName}") + print(traceback) + for test, traceback in result.errors: + test_name = getattr(test, '_testMethodName', str(test)) + print(f"\nERROR: {test_name}") + print(traceback) + else: + print("\nAll tests passed successfully!") + +def test_name(name): + def decorator(func): + func.__name__ = name + return func + return decorator def parse_arguments(): parser = argparse.ArgumentParser(description="Run inference on a Sound-To-Text Model.") - parser.add_argument("--model_dir", type=str, required=True, help="Hugging Face model link or local_dir") - parser.add_argument("--model_save_dir", type=str, required=True, help="Local directory that model is saved") + parser.add_argument("--model_dir", type=str, 
default="jan-hq/Jan-Llama3-0708", help="Hugging Face model link or local_dir") + parser.add_argument("--max_length", type=int, default=1024, help="Maximum length of the output") parser.add_argument("--data_dir", type=str, required=True, help="Hugging Face model repository link or Data path") + parser.add_argument("--cache_dir", type=str, default=".", help="Absolute path to save the model and dataset") parser.add_argument("--mode", type=str, default="audio", help="Mode of the model (audio or text)") parser.add_argument("--num_rows", type=int, default=5, help="Number of dataset rows to process") parser.add_argument("--output_file", type=str, default="output/", help="Output file path") @@ -29,23 +106,20 @@ def setUpClass(cls): cls.save_dir_output = f'{args.output_file}/{model_name}-{args.mode}-Result.csv' if not os.path.exists(args.output_file): os.makedirs(args.output_file) - cls.sampling_params = SamplingParams(temperature=0.0, max_tokens=1024, skip_special_tokens=False) - model_dir = "" - if os.path.exists(args.model_save_dir): - model_dir = args.model_save_dir + cls.sampling_params = SamplingParams(temperature=0.0, max_tokens=args.max_length, skip_special_tokens=False) + # Download model + model_save_dir = os.path.join(args.cache_dir, args.model_dir) + if not os.path.exists(model_save_dir): + snapshot_download(args.model_dir, local_dir=model_save_dir, max_workers=64) else: - # Download model - if not os.path.exists(args.model_dir): - snapshot_download(args.model_dir, local_dir=args.model_dir, max_workers=64) - else: - print(f"Found {args.model_dir}. Skipping download.") - model_dir = args.model_dir + print(f"Found {model_save_dir}. Skipping download.") # Model loading using vllm - cls.tokenizer = AutoTokenizer.from_pretrained(model_dir) - cls.llm = LLM(model_dir, tokenizer=model_dir) + cls.tokenizer = AutoTokenizer.from_pretrained(model_save_dir) + cls.llm = LLM(model_save_dir, tokenizer=model_save_dir) # Load dataset - cls.dataset = load_dataset(args.data_dir, cache_dir=".cache/")['train'] + data_save_dir = os.path.join(args.cache_dir, args.data_dir) + cls.dataset = load_dataset(args.data_dir, split='train') cls.num_rows = min(args.num_rows, len(cls.dataset)) cls.inference_results = [] if args.mode == "audio": @@ -88,6 +162,7 @@ def vllm_qna_inference(self, sample_id): # return input_str, output_based_on_input, expected_output_str, output_token_ids + @test_name("Output validation (non-empty, correct type)") def test_model_output(self): for text_input_str, output_based_on_sound, expected_output_str, output_token_ids in self.inference_results: # Test 1: Check if output is not empty @@ -104,7 +179,7 @@ def test_model_output(self): # output_words = set(output_based_on_sound.lower().split()) # relevance_score = corpus_bleu(output_words, reference_words) # self.assertGreater(relevance_score, 0.3) - + @test_name("Test Special Tokens Handling") def test_special_tokens(self): # Test 5: Check if special tokens are handled correctly special_tokens = [self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token] @@ -119,12 +194,12 @@ def test_special_tokens(self): # results = [self.inference_results[0][1] for _ in range(3)] # self.assertEqual(results[0], results[1]) # self.assertEqual(results[1], results[2]) - + @test_name("Test for NaN outputs") def test_no_nan_outputs(self): # Test 7: Check for NaN outputs for _, output, _, _ in self.inference_results: self.assertFalse(any(np.isnan(float(word)) for word in output.split() if word.replace('.', '').isdigit())) - + @test_name("Test for 
EOS token generation") def test_eos_token_generation(self): # Test 8: Check if EOS token is generated for _, output_based_on_sound, _, output_token_ids in self.inference_results: @@ -142,4 +217,6 @@ def test_eos_token_generation(self): if __name__ == "__main__": - unittest.main(argv=['first-arg-is-ignored'], exit=False) \ No newline at end of file + runner = CustomTestRunner(stream=sys.stdout, verbosity=2) + unittest.main(argv=['first-arg-is-ignored'], exit=False, testRunner=runner) + runner.print_results() \ No newline at end of file From c916c759a81fb224b7a84376d71bf741e0c370a9 Mon Sep 17 00:00:00 2001 From: bachvudinh Date: Sat, 13 Jul 2024 11:08:04 +0000 Subject: [PATCH 3/3] correct readme --- tests/README.md | 294 ++++++------------------------------------------ 1 file changed, 35 insertions(+), 259 deletions(-) diff --git a/tests/README.md b/tests/README.md index 25af925..b902279 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,259 +1,35 @@ ---- -datasets: -- jan-hq/instruction-speech-v1 -language: -- en -license: apache-2.0 -tags: -- sound language model ---- - -## Model Details - -We have developed and released the family [Jan-Llama3](https://huggingface.co/collections/jan-hq/jan-llama3-668e4dad446c8736208dca4f). This family is natively understanding audio and text input. - -We continue to expand [Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) with sound understanding capabilities by leveraging 700M tokens [Instruction Speech v1](https://huggingface.co/datasets/jan-hq/instruction-speech-v1) dataset. - -**Model developers** Homebrew Research. - -**Input** Text and sound. - -**Output** Text. - -**Model Architecture** Llama-3. - -**Language(s):** English. - -## Intended Use - -**Intended Use Cases** This family is primarily intended for research applications. This version aims to further improve the LLM on sound understanding capabilities. - -**Out-of-scope** The use of Llama-3-Sound in any manner that violates applicable laws or regulations is strictly prohibited. - -## How to Get Started with the Model - -First, we need to convert the audio file to sound tokens - -```python -import torch -import torchaudio -from encodec import EncodecModel -from encodec.utils import convert_audio - -def audio_to_sound_tokens(audio_path, target_bandwidth=1.5, device="cuda"): - # Initialize Encodec - model = EncodecModel.encodec_model_24khz() - model.set_target_bandwidth(target_bandwidth) - model.to(device) - - # Load and preprocess audio - wav, sr = torchaudio.load(audio_path) - wav = convert_audio(wav, sr, model.sample_rate, model.channels) - wav = wav.unsqueeze(0).to(device) - - # Encode audio - with torch.no_grad(): - encoded_frames = model.encode(wav) - codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1) - - # Flatten codes - audio_code1, audio_code2 = codes[0][0], codes[0][1] - flatten_tokens = torch.stack((audio_code1, audio_code2), dim=1).flatten().tolist() - - # Convert to sound tokens - result = ''.join(f'<|sound_{num}|>' for num in flatten_tokens) - return f'<|sound_start|>{result}<|sound_end|>' - -# Usage -sound_tokens = audio_to_sound_tokens("/path/to/your/audio/file") -``` - -Then, we can inference the model the same as any other LLM. 
- -```python -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline - -def setup_pipeline(model_path, use_4bit=True): - tokenizer = AutoTokenizer.from_pretrained(model_path) - - model_kwargs = {"device_map": "auto"} - - if use_4bit: - model_kwargs["quantization_config"] = BitsAndBytesConfig( - load_in_4bit=True, - bnb_4bit_compute_dtype=torch.bfloat16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type="nf4", - ) - - model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs) - - return pipeline("text-generation", model=model, tokenizer=tokenizer) - -def generate_text(pipe, messages, max_new_tokens=64, temperature=0.0, do_sample=False): - generation_args = { - "max_new_tokens": max_new_tokens, - "return_full_text": False, - "temperature": temperature, - "do_sample": do_sample, - } - - output = pipe(messages, **generation_args) - return output[0]['generated_text'] - -# Usage -llm_path = "jan-hq/Jan-Llama3-0708" -pipe = setup_pipeline(llm_path, use_4bit=True) -messages = [ - {"role": "user", "content": sound_tokens}, -] -generated_text = generate_text(pipe, messages) -print(generated_text) -``` - -## Training process -**Training Metrics Image**: Below is a snapshot of the training loss curve visualized. - -![train_loss_curve/png](https://cdn-uploads.huggingface.co/production/uploads/65713d70f56f9538679e5a56/9bv-kpnqrTxaBhiYrVHN7.png) - -### Hardware - -**GPU Configuration**: Cluster of 8x NVIDIA H100-SXM-80GB. -**GPU Usage**: - - **Continual Training**: 8 hours. - -### Training Arguments - -| Parameter | Continual Training | -|----------------------------|-------------------------| -| **Epoch** | 1 | -| **Global batch size** | 128 | -| **Learning Rate** | 5e-5 | -| **Learning Scheduler** | Cosine with warmup | -| **Optimizer** | [Adam-mini](https://arxiv.org/abs/2406.16793) | -| **Warmup Ratio** | 0.1 | -| **Weight Decay** | 0.01 | -| **beta1** | 0.9 | -| **beta2** | 0.98 | -| **epsilon** | 1e-6 | -| **Gradient Cliping** | 1.0 | - -### - Accelerate FSDP Config - -``` -compute_environment: LOCAL_MACHINE -debug: false -distributed_type: FSDP -downcast_bf16: 'no' -enable_cpu_affinity: true -fsdp_config: - fsdp_activation_checkpointing: true - fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP - fsdp_backward_prefetch: BACKWARD_PRE - fsdp_cpu_ram_efficient_loading: true - fsdp_forward_prefetch: false - fsdp_offload_params: false - fsdp_sharding_strategy: FULL_SHARD - fsdp_state_dict_type: SHARDED_STATE_DICT - fsdp_sync_module_states: true - fsdp_use_orig_params: false -machine_rank: 0 -main_training_function: main -mixed_precision: bf16 -num_machines: 1 -num_processes: 8 -rdzv_backend: static -same_network: true -tpu_env: [] -tpu_use_cluster: false -tpu_use_sudo: false -use_cpu: false -``` - -## Examples - -1. Good example: -``` -# Question: -Does the following review have a positive or negative opinion of the movie? - -i thought the relationships were wonderful , -Options: A). negative; B). positive; - -# Expected Output: -The review has a B). positive opinion about the movie because they think the relationships were wonderful. - -# Model's Output: -The review is positive. -``` - -2. 
Misunderstanding example: -``` -# Question: -Translate the following sentence to Russian: -6 Work in JOBS and careers after 40 - -Russian: - -# Expected Output: -6 Работать в РАБОТАХ и карьере после 40 - -Russian (Cyrillic): 6 Работать в РАБОТАХ и карьере после 40 - -Let me explain this to you like you're five years old: This sentence means that people can still work and have careers, even if they're older than 40 years old. The words are in a different language, called Russian. -# Model's Output: -В случае, когда вы хотите перевести фразу "The first step is to identify the language of the text." на русский язык, вы можете использовать следующую фразу: "Первый шаг - определить язык текста." -``` - -3. Off-tracked example: -``` -# Quesion: -Answer the following question: - The roots of the growing plants start to break up the rock - The plant acids dissolve the rock - The rock is broken down into smaller pieces - Erosion begins. What might be the first step of the process? -Answer: - -# Expected Output: -The first step of the process might be: The roots of the growing plants start to break up the rock. - -# Model's Output: -Step 1: Identify the key elements in the premise and hypothesis. - -Premise: "A man in a blue shirt and a red hat is holding a sign." -Key elements: man, blue shirt, red hat, holding, sign - -Hypothesis: "A man is holding a sign." -Key elements: man, holding, sign - -Step 2: Compare the key elements in the premise and hypothesis. - -In the premise, we have a man wearing a blue shirt and a red hat, while in the hypothesis, we only have a man. The premise also mentions that the man is holding a sign, which is also mentioned in the hypothesis. - -Step 3: Determine if the hypothesis is entailed by the premise. - -Since the hypothesis states that a man is holding a sign, and the premise confirms that a man is holding a sign, we can conclude that the hypothesis is entailed by the premise. The additional information about the man's clothing in the premise does not contradict or negate the -``` - -Despite being undertrained, the model demonstrates an emerging grasp of sound-text semantics. - - -## Citation Information - -**BibTeX:** - -``` -@article{Llama-3-Sound: Sound Instruction LLM 2024, - title={Llama-3-Sound}, - author={Homebrew Research}, - year=2024, - month=July}, - url={https://huggingface.co/jan-hq/Jan-Llama3-0708} -``` - -## Acknowledgement - -- **[WhisperSpeech](https://github.com/collabora/WhisperSpeech)** - -- **[Encodec](https://github.com/facebookresearch/encodec)** - -- **[Meta-Llama-3-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct)** \ No newline at end of file +# research +# Unit-test +## Installing Requirements and Running Tests +1. Install the required packages: + ```bash + pip install -r requirements.txt + ``` +2. Run the test suite: + ```bash + python test_case.py --model_dir "jan-hq/Jan-Llama3-0708" \\ + --max_length 1024 \\ + --data_dir "jan-hq/instruction-speech-conversation-test" \\ + --mode "audio" \\ + --num_rows 5 \\ + ``` +## Test Configuration + +- The test suite uses the following model and dataset: +- Model: "jan-hq/Jan-Llama3-0708" +- Tokenizer: "jan-hq/llama-3-sound-init" +- Dataset: "jan-hq/instruction-speech-conversation-test" + +## What the Tests Cover + +1. Output validation (non-empty, correct type) +2. Token ID validation +3. Input-output relevance using BLEU +4. Special token handling +5. Numerical stability (NaN checks) +6. 
Check that the EOS token appears exactly once, at the end of the generated IDs
+
+## Continuous Integration
+
+- This test suite can be integrated into CI/CD pipelines.
+- Note that model download and inference can take significant time.
\ No newline at end of file