 import gc
 import logging
 import os
-import sys
 import unittest
 
 import pytest
-import torchao
-import transformers
 from executorch.extension.pybindings.portable_lib import ExecuTorchModule
-from packaging.version import parse
 from transformers import AutoConfig, AutoTokenizer
 from transformers.testing_utils import slow
 
 
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
 is_ci = os.environ.get("GITHUB_ACTIONS") == "true"
-is_linux_ci = sys.platform.startswith("linux") and os.environ.get("GITHUB_ACTIONS") == "true"
 
 
 class ExecuTorchModelIntegrationTest(unittest.TestCase):
@@ -44,47 +40,36 @@ def __init__(self, *args, **kwargs):
     @slow
     @pytest.mark.run_slow
     @pytest.mark.skipif(
-        is_linux_ci
-        or parse(transformers.__version__) < parse("4.52.0")
-        or parse(torchao.__version__) < parse("0.11.0"),
-        reason="Only available on transformers >= 4.52.0 and torchao >= 0.11.0. OOM on linux runner.",
+        is_ci,
+        reason="Test Phi-4-mini (3.8B) will require runner to be configured with larger RAM",
     )
-    def test_phi4_text_generation_with_custom_sdpa_and_kv_cache_8da4w_8we(self):
+    def test_phi4_text_generation(self):
         model_id = "microsoft/Phi-4-mini-instruct"
         config = AutoConfig.from_pretrained(model_id)
         # NOTE: To make the model exportable we need to set the rope scaling to default to avoid hitting
         # the data-dependent control flow in _longrope_frequency_update. Alternatively, we can rewrite
         # that function to avoid the data-dependent control flow.
         if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
             config.rope_scaling["type"] = "default"
-        model = ExecuTorchModelForCausalLM.from_pretrained(
-            model_id,
-            recipe="xnnpack",
-            config=config,
-            attn_implementation="custom_sdpa",
-            use_custom_kv_cache=True,
-            **{"qlinear": True, "qembeeding": True},
-        )
+        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack", config=config)
         self.assertIsInstance(model, ExecuTorchModelForCausalLM)
         self.assertIsInstance(model.model, ExecuTorchModule)
 
         tokenizer = AutoTokenizer.from_pretrained(model_id)
         generated_text = model.text_generation(
             tokenizer=tokenizer,
             prompt="My favourite condiment is ",
-            max_seq_len=64,
+            max_seq_len=32,
         )
         logging.info(f"\nGenerated text:\n\t{generated_text}")
+        generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
 
-        if not is_ci:
-            generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
-
-            # Free memory before loading eager for quality check
-            del model
-            del tokenizer
-            gc.collect()
+        # Free memory before loading eager for quality check
+        del model
+        del tokenizer
+        gc.collect()
 
-            self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
+        self.assertTrue(check_causal_lm_output_quality(model_id, generated_tokens))
 
     @slow
     @pytest.mark.run_slow
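
As a usage note, below is a minimal standalone sketch that mirrors the simplified calls exercised by the test above. The import path for ExecuTorchModelForCausalLM is an assumption (the corresponding import lines fall in the collapsed part of this diff), and the quality check against the eager model is omitted; everything else follows the test body verbatim.

# Hedged sketch mirroring the simplified test flow above.
from optimum.executorch import ExecuTorchModelForCausalLM  # assumed import path; not shown in this hunk
from transformers import AutoConfig, AutoTokenizer

model_id = "microsoft/Phi-4-mini-instruct"
config = AutoConfig.from_pretrained(model_id)
# Same export workaround as the test: force default rope scaling to avoid the
# data-dependent control flow in _longrope_frequency_update.
if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
    config.rope_scaling["type"] = "default"

model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe="xnnpack", config=config)
tokenizer = AutoTokenizer.from_pretrained(model_id)
print(model.text_generation(tokenizer=tokenizer, prompt="My favourite condiment is ", max_seq_len=32))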