# Llama-3.1-8B-Instruct Accuracy Test
<div>
  <strong>vLLM Version:</strong> vLLM: v0.7.3, vLLM Ascend: v0.7.3 <br>
</div>
<div>
  <strong>Software Environment:</strong> CANN: 8.1.0, PyTorch: 2.5.1, torch-npu: 2.5.1 <br>
</div>
<div>
  <strong>Hardware Environment:</strong> Atlas A2 Series <br>
</div>
<div>
  <strong>Datasets:</strong> ceval-valid, mmlu, gsm8k <br>
</div>
<div>
  <strong>Command:</strong>

```bash
export MODEL_ARGS='pretrained=meta-llama/Llama-3.1-8B-Instruct,max_model_len=4096,dtype=auto,tensor_parallel_size=2,gpu_memory_utilization=0.6'
lm_eval --model vllm --model_args $MODEL_ARGS --tasks ceval-valid,mmlu,gsm8k \
  --apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1
```
</div>
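The command above only prints the result tables. If you also want the raw results JSON for later inspection, lm_eval's `--output_path` (and optionally `--log_samples`) flags can be added. A minimal sketch, assuming the same `$MODEL_ARGS` as above; the `./results` directory name is an arbitrary choice and not part of the original run:

```bash
# Same evaluation as above, but persist the results JSON to disk.
# "./results" is an example output directory, not part of the original run.
lm_eval --model vllm --model_args $MODEL_ARGS --tasks ceval-valid,mmlu,gsm8k \
  --apply_chat_template --fewshot_as_multiturn --num_fewshot 5 --batch_size 1 \
  --output_path ./results --log_samples
```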
<div> </div>

| Task | Filter | n-shot | Metric | Value | Stderr |
|-----------------------|-------:|-------:|----------|--------:|-------:|
| ceval-valid | none | 5 | acc | ↑ 0.5483 | ± 0.0132 |
| mmlu | none | 5 | acc | ↑ 0.6867 | ± 0.0037 |
| gsm8k | flexible-extract | 5 | exact_match | ↑ 0.7286 | ± 0.0122 |
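
If the run was saved with `--output_path`, the three headline numbers above can be pulled back out of the results JSON. A minimal sketch with `jq`, assuming the default lm-eval-harness output layout (`results_<timestamp>.json` under the output directory) and metric keys (`acc,none`, `exact_match,flexible-extract`):

```bash
# Locate the most recent saved results file (the path layout is an assumption).
RESULTS=$(ls ./results/*/results_*.json | tail -n 1)
# Print the headline accuracy / exact-match values shown in the table above.
jq -r '.results["ceval-valid"]["acc,none"],
       .results["mmlu"]["acc,none"],
       .results["gsm8k"]["exact_match,flexible-extract"]' "$RESULTS"
```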
<details>
<summary>ceval-valid</summary>

| Task | Filter | n-shot | Metric | Value | Stderr |
|-----------------------|-------:|-------:|----------|--------:|-------:|
| ceval-valid | none | 5 | acc | ↑ 0.5483 | ± 0.0132 |
| - ceval-valid_accountant | none | 5 | acc | ↑ 0.4898 | ± 0.0722 |
| - ceval-valid_advanced_mathematics | none | 5 | acc | ↑ 0.5263 | ± 0.1177 |
| - ceval-valid_art_studies | none | 5 | acc | ↑ 0.5455 | ± 0.0880 |
| - ceval-valid_basic_medicine | none | 5 | acc | ↑ 0.6842 | ± 0.1096 |
| - ceval-valid_business_administration | none | 5 | acc | ↑ 0.6061 | ± 0.0864 |
| - ceval-valid_chinese_language_and_literature | none | 5 | acc | ↑ 0.4348 | ± 0.1057 |
| - ceval-valid_civil_servant | none | 5 | acc | ↑ 0.4894 | ± 0.0737 |
| - ceval-valid_clinical_medicine | none | 5 | acc | ↑ 0.5455 | ± 0.1087 |
| - ceval-valid_college_chemistry | none | 5 | acc | ↑ 0.4167 | ± 0.1028 |
| - ceval-valid_college_economics | none | 5 | acc | ↑ 0.4545 | ± 0.0678 |
| - ceval-valid_college_physics | none | 5 | acc | ↑ 0.4737 | ± 0.1177 |
| - ceval-valid_college_programming | none | 5 | acc | ↑ 0.5946 | ± 0.0818 |
| - ceval-valid_computer_architecture | none | 5 | acc | ↑ 0.5714 | ± 0.1107 |
| - ceval-valid_computer_network | none | 5 | acc | ↑ 0.7895 | ± 0.0961 |
| - ceval-valid_discrete_mathematics | none | 5 | acc | ↑ 0.4375 | ± 0.1281 |
| - ceval-valid_education_science | none | 5 | acc | ↑ 0.7241 | ± 0.0845 |
| - ceval-valid_electrical_engineer | none | 5 | acc | ↑ 0.4324 | ± 0.0826 |
| - ceval-valid_environmental_impact_assessment_engineer | none | 5 | acc | ↑ 0.5484 | ± 0.0909 |
| - ceval-valid_fire_engineer | none | 5 | acc | ↑ 0.4839 | ± 0.0912 |
| - ceval-valid_high_school_biology | none | 5 | acc | ↑ 0.5263 | ± 0.1177 |
| - ceval-valid_high_school_chemistry | none | 5 | acc | ↑ 0.4737 | ± 0.1177 |
| - ceval-valid_high_school_chinese | none | 5 | acc | ↑ 0.2105 | ± 0.0961 |
| - ceval-valid_high_school_geography | none | 5 | acc | ↑ 0.6842 | ± 0.1096 |
| - ceval-valid_high_school_history | none | 5 | acc | ↑ 0.6500 | ± 0.1094 |
| - ceval-valid_high_school_mathematics | none | 5 | acc | ↑ 0.0000 | ± 0.0000 |
| - ceval-valid_high_school_physics | none | 5 | acc | ↑ 0.3158 | ± 0.1096 |
| - ceval-valid_high_school_politics | none | 5 | acc | ↑ 0.5789 | ± 0.1164 |
| - ceval-valid_ideological_and_moral_cultivation | none | 5 | acc | ↑ 0.8947 | ± 0.0723 |
| - ceval-valid_law | none | 5 | acc | ↑ 0.4583 | ± 0.1039 |
| - ceval-valid_legal_professional | none | 5 | acc | ↑ 0.3913 | ± 0.1041 |
| - ceval-valid_logic | none | 5 | acc | ↑ 0.5000 | ± 0.1091 |
| - ceval-valid_mao_zedong_thought | none | 5 | acc | ↑ 0.5000 | ± 0.1043 |
| - ceval-valid_marxism | none | 5 | acc | ↑ 0.6842 | ± 0.1096 |
| - ceval-valid_metrology_engineer | none | 5 | acc | ↑ 0.5833 | ± 0.1028 |
| - ceval-valid_middle_school_biology | none | 5 | acc | ↑ 0.7143 | ± 0.1010 |
| - ceval-valid_middle_school_chemistry | none | 5 | acc | ↑ 0.8500 | ± 0.0819 |
| - ceval-valid_middle_school_geography | none | 5 | acc | ↑ 0.5833 | ± 0.1486 |
| - ceval-valid_middle_school_history | none | 5 | acc | ↑ 0.5455 | ± 0.1087 |
| - ceval-valid_middle_school_mathematics | none | 5 | acc | ↑ 0.3684 | ± 0.1137 |
| - ceval-valid_middle_school_physics | none | 5 | acc | ↑ 0.6316 | ± 0.1137 |
| - ceval-valid_middle_school_politics | none | 5 | acc | ↑ 0.8095 | ± 0.0878 |
| - ceval-valid_modern_chinese_history | none | 5 | acc | ↑ 0.5217 | ± 0.1065 |
| - ceval-valid_operating_system | none | 5 | acc | ↑ 0.6316 | ± 0.1137 |
| - ceval-valid_physician | none | 5 | acc | ↑ 0.5918 | ± 0.0709 |
| - ceval-valid_plant_protection | none | 5 | acc | ↑ 0.7727 | ± 0.0914 |
| - ceval-valid_probability_and_statistics | none | 5 | acc | ↑ 0.3889 | ± 0.1182 |
| - ceval-valid_professional_tour_guide | none | 5 | acc | ↑ 0.6207 | ± 0.0917 |
| - ceval-valid_sports_science | none | 5 | acc | ↑ 0.6316 | ± 0.1137 |
| - ceval-valid_tax_accountant | none | 5 | acc | ↑ 0.3878 | ± 0.0703 |
| - ceval-valid_teacher_qualification | none | 5 | acc | ↑ 0.7955 | ± 0.0615 |
| - ceval-valid_urban_and_rural_planner | none | 5 | acc | ↑ 0.5217 | ± 0.0745 |
| - ceval-valid_veterinary_medicine | none | 5 | acc | ↑ 0.6087 | ± 0.1041 |
</details>
<details>
<summary>mmlu</summary>

| Task | Filter | n-shot | Metric | Value | Stderr |
|-----------------------|-------:|-------:|----------|--------:|-------:|
| mmlu | none | 5 | acc | ↑ 0.6867 | ± 0.0037 |
| - humanities | none | 5 | acc | ↑ 0.6495 | ± 0.0067 |
| - formal_logic | none | 5 | acc | ↑ 0.5714 | ± 0.0443 |
| - high_school_european_history | none | 5 | acc | ↑ 0.7636 | ± 0.0332 |
| - high_school_us_history | none | 5 | acc | ↑ 0.8186 | ± 0.0270 |
| - high_school_world_history | none | 5 | acc | ↑ 0.8439 | ± 0.0236 |
| - international_law | none | 5 | acc | ↑ 0.8347 | ± 0.0339 |
| - jurisprudence | none | 5 | acc | ↑ 0.7778 | ± 0.0402 |
| - logical_fallacies | none | 5 | acc | ↑ 0.8098 | ± 0.0308 |
| - moral_disputes | none | 5 | acc | ↑ 0.7630 | ± 0.0229 |
| - moral_scenarios | none | 5 | acc | ↑ 0.5687 | ± 0.0166 |
| - philosophy | none | 5 | acc | ↑ 0.7363 | ± 0.0250 |
| - prehistory | none | 5 | acc | ↑ 0.7562 | ± 0.0239 |
| - professional_law | none | 5 | acc | ↑ 0.5111 | ± 0.0128 |
| - world_religions | none | 5 | acc | ↑ 0.8363 | ± 0.0284 |
| - other | none | 5 | acc | ↑ 0.7448 | ± 0.0075 |
| - business_ethics | none | 5 | acc | ↑ 0.7200 | ± 0.0451 |
| - clinical_knowledge | none | 5 | acc | ↑ 0.7509 | ± 0.0266 |
| - college_medicine | none | 5 | acc | ↑ 0.6821 | ± 0.0355 |
| - global_facts | none | 5 | acc | ↑ 0.3900 | ± 0.0490 |
| - human_aging | none | 5 | acc | ↑ 0.6951 | ± 0.0309 |
| - management | none | 5 | acc | ↑ 0.8155 | ± 0.0384 |
| - marketing | none | 5 | acc | ↑ 0.8974 | ± 0.0199 |
| - medical_genetics | none | 5 | acc | ↑ 0.8200 | ± 0.0386 |
| - miscellaneous | none | 5 | acc | ↑ 0.8378 | ± 0.0132 |
| - nutrition | none | 5 | acc | ↑ 0.8039 | ± 0.0227 |
| - professional_accounting | none | 5 | acc | ↑ 0.5532 | ± 0.0297 |
| - professional_medicine | none | 5 | acc | ↑ 0.7721 | ± 0.0255 |
| - virology | none | 5 | acc | ↑ 0.5241 | ± 0.0389 |
| - social sciences | none | 5 | acc | ↑ 0.7797 | ± 0.0073 |
| - econometrics | none | 5 | acc | ↑ 0.6053 | ± 0.0460 |
| - high_school_geography | none | 5 | acc | ↑ 0.8485 | ± 0.0255 |
| - high_school_government_and_politics | none | 5 | acc | ↑ 0.9171 | ± 0.0199 |
| - high_school_macroeconomics | none | 5 | acc | ↑ 0.6923 | ± 0.0234 |
| - high_school_microeconomics | none | 5 | acc | ↑ 0.7647 | ± 0.0276 |
| - high_school_psychology | none | 5 | acc | ↑ 0.8697 | ± 0.0144 |
| - human_sexuality | none | 5 | acc | ↑ 0.8015 | ± 0.0350 |
| - professional_psychology | none | 5 | acc | ↑ 0.7271 | ± 0.0180 |
| - public_relations | none | 5 | acc | ↑ 0.6818 | ± 0.0446 |
| - security_studies | none | 5 | acc | ↑ 0.7224 | ± 0.0287 |
| - sociology | none | 5 | acc | ↑ 0.8358 | ± 0.0262 |
| - us_foreign_policy | none | 5 | acc | ↑ 0.8900 | ± 0.0314 |
| - stem | none | 5 | acc | ↑ 0.5940 | ± 0.0084 |
| - abstract_algebra | none | 5 | acc | ↑ 0.3900 | ± 0.0490 |
| - anatomy | none | 5 | acc | ↑ 0.6741 | ± 0.0405 |
| - astronomy | none | 5 | acc | ↑ 0.7566 | ± 0.0349 |
| - college_biology | none | 5 | acc | ↑ 0.8264 | ± 0.0317 |
| - college_chemistry | none | 5 | acc | ↑ 0.4700 | ± 0.0502 |
| - college_computer_science | none | 5 | acc | ↑ 0.5400 | ± 0.0501 |
| - college_mathematics | none | 5 | acc | ↑ 0.3900 | ± 0.0490 |
| - college_physics | none | 5 | acc | ↑ 0.4314 | ± 0.0493 |
| - computer_security | none | 5 | acc | ↑ 0.8000 | ± 0.0402 |
| - conceptual_physics | none | 5 | acc | ↑ 0.6170 | ± 0.0318 |
| - electrical_engineering | none | 5 | acc | ↑ 0.6552 | ± 0.0396 |
| - elementary_mathematics | none | 5 | acc | ↑ 0.4735 | ± 0.0257 |
| - high_school_biology | none | 5 | acc | ↑ 0.8097 | ± 0.0223 |
| - high_school_chemistry | none | 5 | acc | ↑ 0.6207 | ± 0.0341 |
| - high_school_computer_science | none | 5 | acc | ↑ 0.7300 | ± 0.0446 |
| - high_school_mathematics | none | 5 | acc | ↑ 0.4222 | ± 0.0301 |
| - high_school_physics | none | 5 | acc | ↑ 0.4636 | ± 0.0407 |
| - high_school_statistics | none | 5 | acc | ↑ 0.6065 | ± 0.0333 |
| - machine_learning | none | 5 | acc | ↑ 0.5446 | ± 0.0473 |
</details>
<details>
<summary>gsm8k</summary>

| Task | Filter | n-shot | Metric | Value | Stderr |
|-----------------------|-------:|-------:|----------|--------:|-------:|
| gsm8k | flexible-extract | 5 | exact_match | ↑ 0.7286 | ± 0.0122 |
</details>