Ci cd/bach #26

Closed · wants to merge 19 commits
5 changes: 4 additions & 1 deletion .github/runners/Dockerfile
@@ -1,4 +1,7 @@
FROM docker.io/pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
FROM docker.io/pytorch/pytorch:2.4.0-cuda12.1-cudnn9-runtime

# Set the MKL_SERVICE_FORCE_INTEL environment variable
ENV MKL_SERVICE_FORCE_INTEL=1

RUN apt-get update \
&& apt-get install -y --no-install-recommends \
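Note: MKL_SERVICE_FORCE_INTEL=1 is the usual workaround for the mkl-service threading-layer error that can appear in the PyTorch images when NumPy is imported after another MKL/OpenMP user. A minimal sketch of the same workaround at the Python level, for running the tests outside this container (the import order here is illustrative, not part of this PR):

import os

# Set before importing numpy/torch so mkl-service selects the Intel threading layer,
# mirroring the ENV line added to the runner Dockerfile above.
os.environ.setdefault("MKL_SERVICE_FORCE_INTEL", "1")

import numpy as np
import torch

print(torch.__version__, np.__version__)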
38 changes: 36 additions & 2 deletions .github/workflows/test-branch.yml
@@ -9,7 +9,7 @@ on:
      model_id:
        description: 'Model ID on huggingface, for example: jan-hq/Jan-Llama3-0708'
        required: true
        default: jan-hq/Jan-Llama3-0708
        default: homebrewltd/llama3-s-2024-07-08
        type: string
      dataset_id:
        description: 'Dataset ID on huggingface, for example: jan-hq/instruction-speech-conversation-test'
@@ -21,13 +21,25 @@ on:
        required: false
        default: "--mode audio --num_rows 5"
        type: string
      run_benchmark:
        description: 'Run benchmark test'
        required: false
        default: true
        type: boolean
      run_audio_benchmark:
        description: 'Run audio benchmark test'
        required: false
        default: true
        type: boolean

jobs:
  run-test:
    runs-on: research
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - name: Install dependencies
        working-directory: ./tests
@@ -38,4 +50,26 @@ jobs:
      - name: Run tests
        working-directory: ./tests
        run: |
          python3 test_case.py --model_dir ${{ github.event.inputs.model_id || 'jan-hq/Jan-Llama3-0708' }} --data_dir ${{ github.event.inputs.dataset_id || 'jan-hq/instruction-speech-conversation-test' }} ${{ github.event.inputs.extra_args || '--mode audio --num_rows 5' }}
          python3 test_case.py --model_dir ${{ github.event.inputs.model_id || 'jan-hq/Jan-Llama3-0708' }} --data_dir ${{ github.event.inputs.dataset_id || 'jan-hq/instruction-speech-conversation-test' }} ${{ github.event.inputs.extra_args || '--mode audio --num_rows 5' }}

      - name: Install benchmark dependencies
        if: ${{ github.event.inputs.run_benchmark == 'true' }}
        run: |
          cd lm-evaluation-harness
          pip3 install -e .
          pip3 install lm_eval[vllm]
          echo "$HOME/.local/bin" >> $GITHUB_PATH

      - name: Run benchmark
        if: ${{ github.event.inputs.run_benchmark == 'true' }}
        run: |
          cd lm-evaluation-harness
          chmod +x ./run_benchmark.sh
          ./run_benchmark.sh ${{ github.event.inputs.model_id }}

      - name: Upload benchmark results
        if: ${{ github.event.inputs.run_benchmark == 'true' }}
        uses: actions/upload-artifact@v2
        with:
          name: benchmark-results
          path: ./lm-evaluation-harness/benchmark_results/*.json
22 changes: 15 additions & 7 deletions .github/workflows/test-models.yml
@@ -3,9 +3,9 @@ on:
  workflow_dispatch:
    inputs:
      model_id:
        description: 'Model ID on huggingface, for example: jan-hq/Jan-Llama3-0708'
        description: 'Model ID on huggingface, for example: homebrewltd/llama3-s-2024-07-08'
        required: true
        default: jan-hq/Jan-Llama3-0708
        default: homebrewltd/llama3-s-2024-07-08
        type: string
      dataset_id:
        description: 'Dataset ID on huggingface, for example: jan-hq/instruction-speech-conversation-test'
@@ -20,7 +20,12 @@ on:
      run_benchmark:
        description: 'Run benchmark test'
        required: false
        default: false
        default: true
        type: boolean
      run_audio_benchmark:
        description: 'Run audio benchmark test'
        required: false
        default: true
        type: boolean

jobs:
@@ -29,6 +34,8 @@ jobs:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          submodules: 'recursive'

      - name: Install dependencies
        working-directory: ./tests
@@ -39,19 +46,20 @@ jobs:
      - name: Run tests
        working-directory: ./tests
        run: |
          python3 test_case.py --model_dir ${{ github.event.inputs.model_id }} --data_dir ${{ github.event.inputs.dataset_id }} ${{ github.event.inputs.extra_args }}
          python3 test_case.py --model_dir ${{ github.event.inputs.model_id || 'jan-hq/Jan-Llama3-0708' }} --data_dir ${{ github.event.inputs.dataset_id || 'jan-hq/instruction-speech-conversation-test' }} ${{ github.event.inputs.extra_args || '--mode audio --num_rows 5' }}

      - name: Install benchmark dependencies
        if: ${{ github.event.inputs.run_benchmark == 'true' }}
        working-directory: ./lm-evaluation-harness
        run: |
          cd lm-evaluation-harness
          pip3 install -e .
          pip3 install lm_eval[vllm]
          echo "$HOME/.local/bin" >> $GITHUB_PATH

      - name: Run benchmark
        if: ${{ github.event.inputs.run_benchmark == 'true' }}
        working-directory: ./lm-evaluation-harness
        run: |
          cd lm-evaluation-harness
          chmod +x ./run_benchmark.sh
          ./run_benchmark.sh ${{ github.event.inputs.model_id }}

@@ -60,4 +68,4 @@ jobs:
        uses: actions/upload-artifact@v2
        with:
          name: benchmark-results
          path: ./lm-evaluation-harness/benchmark_results/*.json
          path: ./lm-evaluation-harness/benchmark_results/**/*.json
3 changes: 3 additions & 0 deletions .gitmodules
@@ -1,3 +1,6 @@
[submodule "lm-evaluation-harness"]
path = lm-evaluation-harness
url = git@github.com:homebrewltd/lm-evaluation-harness.git
[submodule "AudioBench"]
path = AudioBench
url = git@github.com:homebrewltd/AudioBench.git
1 change: 1 addition & 0 deletions AudioBench
Submodule AudioBench added at 5e2b85
2 changes: 1 addition & 1 deletion lm-evaluation-harness
22 changes: 22 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,22 @@
import pytest

def pytest_addoption(parser):
    parser.addoption("--model_dir", type=str, default="jan-hq/Jan-Llama3-0708", help="Hugging Face model link or local_dir")
    parser.addoption("--max_length", type=int, default=1024, help="Maximum length of the output")
    parser.addoption("--data_dir", type=str, required=True, help="Hugging Face model repository link or Data path")
    parser.addoption("--cache_dir", type=str, default=".", help="Absolute path to save the model and dataset")
    parser.addoption("--mode", type=str, default="audio", help="Mode of the model (audio or text)")
    parser.addoption("--num_rows", type=int, default=5, help="Number of dataset rows to process")
    parser.addoption("--output_file", type=str, default="output/", help="Output file path")

@pytest.fixture(scope="session")
def custom_args(request):
    return {
        "model_dir": request.config.getoption("--model_dir"),
        "max_length": request.config.getoption("--max_length"),
        "data_dir": request.config.getoption("--data_dir"),
        "cache_dir": request.config.getoption("--cache_dir"),
        "mode": request.config.getoption("--mode"),
        "num_rows": request.config.getoption("--num_rows"),
        "output_file": request.config.getoption("--output_file"),
    }
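For reference, a minimal sketch of how the custom_args fixture above is consumed from a test; the test function is hypothetical and only illustrates the option plumbing (the invocation in the comment mirrors the CI arguments):

# Hypothetical example, not part of this PR.
def test_custom_args_are_wired(custom_args):
    # custom_args is the session-scoped dict built in conftest.py from the pytest options
    assert custom_args["mode"] in ("audio", "text")
    assert custom_args["num_rows"] > 0
    assert custom_args["model_dir"]

# Example invocation:
#   pytest tests/unit_test.py --model_dir jan-hq/Jan-Llama3-0708 \
#       --data_dir jan-hq/instruction-speech-conversation-test --mode audio --num_rows 5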
10 changes: 6 additions & 4 deletions tests/requirements.txt
@@ -1,7 +1,9 @@
torch==2.4.0
datasets==2.20.0
torch==2.3.0
transformers
vllm
transformers>=4.43.0
vllm==0.5.4
huggingface_hub==0.23.4
pandas==2.2.2
nltk
nltk
pytest
pytest-cov
4 changes: 2 additions & 2 deletions tests/test_case.py
@@ -114,8 +114,8 @@ def setUpClass(cls):
        else:
            print(f"Found {model_save_dir}. Skipping download.")
        # Model loading using vllm
        cls.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        cls.llm = LLM(model_dir, tokenizer=model_dir, gpu_memory_utilization=0.3)
        cls.tokenizer = AutoTokenizer.from_pretrained(model_save_dir)
        cls.llm = LLM(model_save_dir, tokenizer=model_save_dir)

        # Load dataset
        data_save_dir = os.path.join(args.cache_dir, args.data_dir)
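The test_case.py change above points both the tokenizer and the vLLM engine at the locally downloaded snapshot (model_save_dir) rather than the original hub id, so cached files are reused. A standalone sketch of that pattern, with the model id and cache directory as placeholders:

import os

from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from vllm import LLM

model_id = "homebrewltd/llama3-s-2024-07-08"  # placeholder; matches the new workflow default
cache_dir = "."                               # placeholder

model_save_dir = os.path.join(cache_dir, model_id)
if not os.path.exists(model_save_dir):
    # Download once into the cache; later runs skip this branch.
    snapshot_download(model_id, local_dir=model_save_dir, max_workers=64)

# Load tokenizer and engine from the same local path, as the updated test does.
tokenizer = AutoTokenizer.from_pretrained(model_save_dir)
llm = LLM(model_save_dir, tokenizer=model_save_dir)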
102 changes: 102 additions & 0 deletions tests/unit_test.py
@@ -0,0 +1,102 @@
import pytest
from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
from datasets import load_dataset
import pandas as pd
import numpy as np
import os
import time
from types import SimpleNamespace

@pytest.fixture(scope="module")
def model_setup(custom_args):
    # custom_args (from conftest.py) is a plain dict; wrap it so the attribute-style access below works
    args = SimpleNamespace(**custom_args)
    model_name = args.model_dir.split("/")[-1]
    save_dir_output = f'{args.output_file}/{model_name}-{args.mode}-Result.csv'
    if not os.path.exists(args.output_file):
        os.makedirs(args.output_file)

    sampling_params = SamplingParams(temperature=0.0, max_tokens=args.max_length, skip_special_tokens=False)

    model_save_dir = os.path.join(args.cache_dir, args.model_dir)
    if not os.path.exists(model_save_dir):
        snapshot_download(args.model_dir, local_dir=model_save_dir, max_workers=64)
    else:
        print(f"Found {model_save_dir}. Skipping download.")

    tokenizer = AutoTokenizer.from_pretrained(model_save_dir)
    # gpu_memory_utilization must be a fraction between 0 and 1; -1 is not a valid value
    llm = LLM(model_save_dir, tokenizer=model_save_dir, gpu_memory_utilization=0.3)

    data_save_dir = os.path.join(args.cache_dir, args.data_dir)
    dataset = load_dataset(args.data_dir, split='train')
    num_rows = min(args.num_rows, len(dataset))

    return args, tokenizer, llm, dataset, num_rows, sampling_params, save_dir_output

@pytest.fixture(scope="module")
def inference_results(model_setup):
    args, tokenizer, llm, dataset, num_rows, sampling_params, save_dir_output = model_setup
    results = []

    def vllm_sound_inference(sample_id):
        sound_messages = dataset[sample_id]['sound_convo'][0]
        expected_output_messages = dataset[sample_id]['sound_convo'][1]
        sound_input_str = tokenizer.apply_chat_template([sound_messages], tokenize=False, add_generation_prompt=True)
        text_input_str = dataset[sample_id]['prompt']
        expected_output_str = tokenizer.apply_chat_template([expected_output_messages], tokenize=False)

        outputs = llm.generate(sound_input_str, sampling_params)
        output_based_on_sound = outputs[0].outputs[0].text
        output_token_ids = outputs[0].outputs[0].token_ids

        return text_input_str, output_based_on_sound, expected_output_str, output_token_ids

    def vllm_qna_inference(sample_id):
        text_input_str = dataset[sample_id]['prompt']
        expected_answer_str = dataset[sample_id]['answer']
        question_str = tokenizer.apply_chat_template([text_input_str], tokenize=False, add_generation_prompt=True)
        outputs = llm.generate(question_str, sampling_params)
        output_based_on_question = outputs[0].outputs[0].text
        output_token_ids = outputs[0].outputs[0].token_ids

        return text_input_str, output_based_on_question, expected_answer_str, output_token_ids

    if args.mode == "audio":
        for i in range(num_rows):
            results.append(vllm_sound_inference(i))
    elif args.mode == "text":
        for i in range(num_rows):
            results.append(vllm_qna_inference(i))

    df_results = pd.DataFrame(results, columns=['input', 'output', 'expected_output', 'output_token_ids'])
    df_results.to_csv(save_dir_output, index=False, encoding='utf-8')
    print(f"Successfully saved in {save_dir_output}")

    return results

def test_model_output(inference_results):
    for text_input_str, output_based_on_sound, expected_output_str, output_token_ids in inference_results:
        assert len(output_based_on_sound) > 0, "Output should not be empty"
        assert isinstance(output_based_on_sound, str), "Output should be a string"
        assert all(token >= 0 for token in output_token_ids), "Output tokens should be valid"

def test_special_tokens(model_setup, inference_results):
    _, tokenizer, _, _, _, _, _ = model_setup
    special_tokens = [tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token]
    for token in special_tokens:
        if token:
            encoded = tokenizer.encode(token)
            assert encoded[0] != -100, f"Special token {token} should not be ignored"

def test_no_nan_outputs(inference_results):
    for _, output, _, _ in inference_results:
        assert not any(np.isnan(float(word)) for word in output.split() if word.replace('.', '').isdigit()), "Output should not contain NaN values"

def test_eos_token_generation(model_setup, inference_results):
    _, tokenizer, _, _, _, _, _ = model_setup
    eos_token_id = tokenizer.eos_token_id
    for _, _, _, output_token_ids in inference_results:
        assert eos_token_id in output_token_ids, "EOS token not found in the generated output"
        assert output_token_ids[-1] == eos_token_id, "EOS token is not at the end of the sequence"
        assert output_token_ids.count(eos_token_id) == 1, f"Expected 1 EOS token, but found {output_token_ids.count(eos_token_id)}"

# Additional tests can be added here
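These pytest-based checks can also be launched programmatically; a small sketch (model and dataset ids are placeholders) that passes the options registered in conftest.py:

import sys

import pytest

# Equivalent to: pytest tests/unit_test.py --model_dir ... --data_dir ... --mode audio --num_rows 2
exit_code = pytest.main([
    "tests/unit_test.py",
    "--model_dir", "jan-hq/Jan-Llama3-0708",                      # placeholder model id
    "--data_dir", "jan-hq/instruction-speech-conversation-test",  # placeholder dataset id
    "--mode", "audio",
    "--num_rows", "2",
    "-q",
])
sys.exit(exit_code)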