diff --git a/.github/runners/Dockerfile b/.github/runners/Dockerfile
index 6f5827d..fe31363 100644
--- a/.github/runners/Dockerfile
+++ b/.github/runners/Dockerfile
@@ -1,4 +1,7 @@
-FROM docker.io/pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+FROM docker.io/pytorch/pytorch:2.4.0-cuda12.1-cudnn9-runtime
+
+# Set the MKL_SERVICE_FORCE_INTEL environment variable
+ENV MKL_SERVICE_FORCE_INTEL=1
 
 RUN apt-get update \
     && apt-get install -y --no-install-recommends \
diff --git a/.github/workflows/test-branch.yml b/.github/workflows/test-branch.yml
index a02e536..760ffaa 100644
--- a/.github/workflows/test-branch.yml
+++ b/.github/workflows/test-branch.yml
@@ -9,7 +9,7 @@ on:
       model_id:
         description: 'Model ID on huggingface, for example: jan-hq/Jan-Llama3-0708'
         required: true
-        default: jan-hq/Jan-Llama3-0708
+        default: homebrewltd/llama3-s-2024-07-08
         type: string
       dataset_id:
         description: 'Dataset ID on huggingface, for example: jan-hq/instruction-speech-conversation-test'
@@ -21,6 +21,16 @@ on:
         required: false
         default: "--mode audio --num_rows 5"
         type: string
+      run_benchmark:
+        description: 'Run benchmark test'
+        required: false
+        default: true
+        type: boolean
+      run_audio_benchmark:
+        description: 'Run audio benchmark test'
+        required: false
+        default: true
+        type: boolean
 
 jobs:
   run-test:
@@ -28,6 +38,8 @@ jobs:
     steps:
       - name: Checkout
        uses: actions/checkout@v4
+        with:
+          submodules: 'recursive'
 
       - name: Install dependencies
         working-directory: ./tests
@@ -38,4 +50,26 @@ jobs:
       - name: Run tests
         working-directory: ./tests
         run: |
-          python3 test_case.py --model_dir ${{ github.event.inputs.model_id || 'jan-hq/Jan-Llama3-0708' }} --data_dir ${{ github.event.inputs.dataset_id || 'jan-hq/instruction-speech-conversation-test' }} ${{ github.event.inputs.extra_args || '--mode audio --num_rows 5' }}
\ No newline at end of file
+          python3 test_case.py --model_dir ${{ github.event.inputs.model_id || 'jan-hq/Jan-Llama3-0708' }} --data_dir ${{ github.event.inputs.dataset_id || 'jan-hq/instruction-speech-conversation-test' }} ${{ github.event.inputs.extra_args || '--mode audio --num_rows 5' }}
+
+      - name: Install benchmark dependencies
+        if: ${{ github.event.inputs.run_benchmark == 'true' }}
+        run: |
+          cd lm-evaluation-harness
+          pip3 install -e .
+          pip3 install lm_eval[vllm]
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
+
+      - name: Run benchmark
+        if: ${{ github.event.inputs.run_benchmark == 'true' }}
+        run: |
+          cd lm-evaluation-harness
+          chmod +x ./run_benchmark.sh
+          ./run_benchmark.sh ${{ github.event.inputs.model_id }}
+
+      - name: Upload benchmark results
+        if: ${{ github.event.inputs.run_benchmark == 'true' }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: benchmark-results
+          path: ./lm-evaluation-harness/benchmark_results/*.json
\ No newline at end of file
diff --git a/.github/workflows/test-models.yml b/.github/workflows/test-models.yml
index a51d9d3..456c098 100644
--- a/.github/workflows/test-models.yml
+++ b/.github/workflows/test-models.yml
@@ -3,9 +3,9 @@ on:
   workflow_dispatch:
     inputs:
       model_id:
-        description: 'Model ID on huggingface, for example: jan-hq/Jan-Llama3-0708'
+        description: 'Model ID on huggingface, for example: homebrewltd/llama3-s-2024-07-08'
         required: true
-        default: jan-hq/Jan-Llama3-0708
+        default: homebrewltd/llama3-s-2024-07-08
         type: string
       dataset_id:
         description: 'Dataset ID on huggingface, for example: jan-hq/instruction-speech-conversation-test'
@@ -20,7 +20,12 @@ on:
       run_benchmark:
         description: 'Run benchmark test'
         required: false
-        default: false
+        default: true
+        type: boolean
+      run_audio_benchmark:
+        description: 'Run audio benchmark test'
+        required: false
+        default: true
         type: boolean
 
 jobs:
@@ -29,6 +34,8 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+        with:
+          submodules: 'recursive'
 
       - name: Install dependencies
         working-directory: ./tests
@@ -39,19 +46,20 @@ jobs:
       - name: Run tests
         working-directory: ./tests
         run: |
-          python3 test_case.py --model_dir ${{ github.event.inputs.model_id }} --data_dir ${{ github.event.inputs.dataset_id }} ${{ github.event.inputs.extra_args }}
+          python3 test_case.py --model_dir ${{ github.event.inputs.model_id || 'jan-hq/Jan-Llama3-0708' }} --data_dir ${{ github.event.inputs.dataset_id || 'jan-hq/instruction-speech-conversation-test' }} ${{ github.event.inputs.extra_args || '--mode audio --num_rows 5' }}
 
       - name: Install benchmark dependencies
         if: ${{ github.event.inputs.run_benchmark == 'true' }}
-        working-directory: ./lm-evaluation-harness
         run: |
+          cd lm-evaluation-harness
           pip3 install -e .
           pip3 install lm_eval[vllm]
+          echo "$HOME/.local/bin" >> $GITHUB_PATH
 
       - name: Run benchmark
         if: ${{ github.event.inputs.run_benchmark == 'true' }}
-        working-directory: ./lm-evaluation-harness
         run: |
+          cd lm-evaluation-harness
           chmod +x ./run_benchmark.sh
           ./run_benchmark.sh ${{ github.event.inputs.model_id }}
 
@@ -60,4 +68,4 @@ jobs:
         uses: actions/upload-artifact@v2
         with:
           name: benchmark-results
-          path: ./lm-evaluation-harness/benchmark_results/*.json
\ No newline at end of file
+          path: ./lm-evaluation-harness/benchmark_results/**/*.json
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index b6b8212..7fa598a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "lm-evaluation-harness"]
 	path = lm-evaluation-harness
 	url = git@github.com:homebrewltd/lm-evaluation-harness.git
+[submodule "AudioBench"]
+	path = AudioBench
+	url = git@github.com:homebrewltd/AudioBench.git
diff --git a/AudioBench b/AudioBench
new file mode 160000
index 0000000..5e2b856
--- /dev/null
+++ b/AudioBench
@@ -0,0 +1 @@
+Subproject commit 5e2b8565b34ef522457ccfb0c99f60bbdd1a51ea
diff --git a/lm-evaluation-harness b/lm-evaluation-harness
index 58b0b06..1996e4e 160000
--- a/lm-evaluation-harness
+++ b/lm-evaluation-harness
@@ -1 +1 @@
-Subproject commit 58b0b0605f8f6a8f0d71dde7901d67f9a1759d6e
+Subproject commit 1996e4e3e63adf2458ff0368e8f36a439ef3979f
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..83483fc
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,22 @@
+import pytest
+
+def pytest_addoption(parser):
+    parser.addoption("--model_dir", type=str, default="jan-hq/Jan-Llama3-0708", help="Hugging Face model link or local_dir")
+    parser.addoption("--max_length", type=int, default=1024, help="Maximum length of the output")
+    parser.addoption("--data_dir", type=str, required=True, help="Hugging Face dataset repository link or data path")
+    parser.addoption("--cache_dir", type=str, default=".", help="Absolute path to save the model and dataset")
+    parser.addoption("--mode", type=str, default="audio", help="Mode of the model (audio or text)")
+    parser.addoption("--num_rows", type=int, default=5, help="Number of dataset rows to process")
+    parser.addoption("--output_file", type=str, default="output/", help="Output file path")
+
+@pytest.fixture(scope="session")
+def custom_args(request):
+    return {
+        "model_dir": request.config.getoption("--model_dir"),
+        "max_length": request.config.getoption("--max_length"),
+        "data_dir": request.config.getoption("--data_dir"),
+        "cache_dir": request.config.getoption("--cache_dir"),
+        "mode": request.config.getoption("--mode"),
+        "num_rows": request.config.getoption("--num_rows"),
+        "output_file": request.config.getoption("--output_file"),
+    }
\ No newline at end of file
diff --git a/tests/requirements.txt b/tests/requirements.txt
index e41cf80..d9e8afa 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -1,7 +1,9 @@
+torch==2.4.0
 datasets==2.20.0
-torch==2.3.0
-transformers
-vllm
+transformers>=4.43.0
+vllm==0.5.4
 huggingface_hub==0.23.4
 pandas==2.2.2
-nltk
\ No newline at end of file
+nltk
+pytest
+pytest-cov
diff --git a/tests/test_case.py b/tests/test_case.py
index 624b701..dbcaca5 100644
--- a/tests/test_case.py
+++ b/tests/test_case.py
@@ -114,8 +114,8 @@ def setUpClass(cls):
         else:
             print(f"Found {model_save_dir}. Skipping download.")
         # Model loading using vllm
-        cls.tokenizer = AutoTokenizer.from_pretrained(model_dir)
-        cls.llm = LLM(model_dir, tokenizer=model_dir, gpu_memory_utilization=0.3)
+        cls.tokenizer = AutoTokenizer.from_pretrained(model_save_dir)
+        cls.llm = LLM(model_save_dir, tokenizer=model_save_dir)
 
         # Load dataset
         data_save_dir = os.path.join(args.cache_dir, args.data_dir)
diff --git a/tests/unit_test.py b/tests/unit_test.py
new file mode 100644
index 0000000..c761dd4
--- /dev/null
+++ b/tests/unit_test.py
@@ -0,0 +1,102 @@
+import pytest
+from huggingface_hub import snapshot_download
+from vllm import LLM, SamplingParams
+from transformers import AutoTokenizer
+from datasets import load_dataset
+import pandas as pd
+import numpy as np
+import os
+from types import SimpleNamespace
+
+@pytest.fixture(scope="module")
+def model_setup(custom_args):
+    args = SimpleNamespace(**custom_args)  # custom_args is a plain dict from conftest.py; wrap it for attribute access
+    model_name = args.model_dir.split("/")[-1]
+    save_dir_output = f'{args.output_file}/{model_name}-{args.mode}-Result.csv'
+    if not os.path.exists(args.output_file):
+        os.makedirs(args.output_file)
+
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=args.max_length, skip_special_tokens=False)
+
+    model_save_dir = os.path.join(args.cache_dir, args.model_dir)
+    if not os.path.exists(model_save_dir):
+        snapshot_download(args.model_dir, local_dir=model_save_dir, max_workers=64)
+    else:
+        print(f"Found {model_save_dir}. Skipping download.")
+
+    tokenizer = AutoTokenizer.from_pretrained(model_save_dir)
+    llm = LLM(model_save_dir, tokenizer=model_save_dir)
+
+    data_save_dir = os.path.join(args.cache_dir, args.data_dir)
+    dataset = load_dataset(args.data_dir, split='train')
+    num_rows = min(args.num_rows, len(dataset))
+
+    return args, tokenizer, llm, dataset, num_rows, sampling_params, save_dir_output
+
+@pytest.fixture(scope="module")
+def inference_results(model_setup):
+    args, tokenizer, llm, dataset, num_rows, sampling_params, save_dir_output = model_setup
+    results = []
+
+    def vllm_sound_inference(sample_id):
+        sound_messages = dataset[sample_id]['sound_convo'][0]
+        expected_output_messages = dataset[sample_id]['sound_convo'][1]
+        sound_input_str = tokenizer.apply_chat_template([sound_messages], tokenize=False, add_generation_prompt=True)
+        text_input_str = dataset[sample_id]['prompt']
+        expected_output_str = tokenizer.apply_chat_template([expected_output_messages], tokenize=False)
+
+        outputs = llm.generate(sound_input_str, sampling_params)
+        output_based_on_sound = outputs[0].outputs[0].text
+        output_token_ids = outputs[0].outputs[0].token_ids
+
+        return text_input_str, output_based_on_sound, expected_output_str, output_token_ids
+
+    def vllm_qna_inference(sample_id):
+        text_input_str = dataset[sample_id]['prompt']
+        expected_answer_str = dataset[sample_id]['answer']
+        question_str = tokenizer.apply_chat_template([text_input_str], tokenize=False, add_generation_prompt=True)
+        outputs = llm.generate(question_str, sampling_params)
+        output_based_on_question = outputs[0].outputs[0].text
+        output_token_ids = outputs[0].outputs[0].token_ids
+
+        return text_input_str, output_based_on_question, expected_answer_str, output_token_ids
+    if args.mode == "audio":
+        for i in range(num_rows):
+            results.append(vllm_sound_inference(i))
+    elif args.mode == "text":
+        for i in range(num_rows):
+            results.append(vllm_qna_inference(i))
+
+    df_results = pd.DataFrame(results, columns=['input', 'output', 'expected_output', 'output_token_ids'])
+    df_results.to_csv(save_dir_output, index=False, encoding='utf-8')
+    print(f"Successfully saved in {save_dir_output}")
{save_dir_output}") + + return results + +def test_model_output(inference_results): + for text_input_str, output_based_on_sound, expected_output_str, output_token_ids in inference_results: + assert len(output_based_on_sound) > 0, "Output should not be empty" + assert isinstance(output_based_on_sound, str), "Output should be a string" + assert all(token >= 0 for token in output_token_ids), "Output tokens should be valid" + +def test_special_tokens(model_setup, inference_results): + _, tokenizer, _, _, _, _, _ = model_setup + special_tokens = [tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token] + for token in special_tokens: + if token: + encoded = tokenizer.encode(token) + assert encoded[0] != -100, f"Special token {token} should not be ignored" + +def test_no_nan_outputs(inference_results): + for _, output, _, _ in inference_results: + assert not any(np.isnan(float(word)) for word in output.split() if word.replace('.', '').isdigit()), "Output should not contain NaN values" + +def test_eos_token_generation(model_setup, inference_results): + _, tokenizer, _, _, _, _, _ = model_setup + eos_token_id = tokenizer.eos_token_id + for _, _, _, output_token_ids in inference_results: + assert eos_token_id in output_token_ids, "EOS token not found in the generated output" + assert output_token_ids[-1] == eos_token_id, "EOS token is not at the end of the sequence" + assert output_token_ids.count(eos_token_id) == 1, f"Expected 1 EOS token, but found {output_token_ids.count(eos_token_id)}" + +# Additional tests can be added here \ No newline at end of file