
Commit 0082c1b

scan and apply_layers: milestone 1
This commit adds the lowering of scan to the HLO While op. It also introduces apply_layers, which can sequentially apply a list of layers using scan underneath. In this milestone we use AOTAutograd to obtain the backward of the function being scanned. Users can either save the activations in fn or recompute them by passing different graph partitioners to AOTAutograd. Also give the lowered fn computation a more meaningful name.
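For context, here is a minimal sketch of the jax.lax.scan-style contract that scan follows: fn consumes a carry and one slice of the input and returns the new carry plus one slice of the output. The import path torch_xla.experimental.scan is an assumption inferred from the new test location (test/scan/test_scan.py); apply_layers composes the same mechanism over a sequence of layers.

import torch
import torch_xla.core.xla_model as xm
from torch_xla.experimental.scan import scan  # assumed module path

def step(carry, x):
    # One iteration of the loop body; the whole scan lowers to a single
    # HLO While op instead of an unrolled graph.
    new_carry = carry + x
    y = new_carry * 2
    return new_carry, y

device = xm.xla_device()
init = torch.zeros(4, device=device)    # carry
xs = torch.ones(10, 4, device=device)   # leading dim is the scan axis
final_carry, ys = scan(step, init, xs)  # ys stacks the per-step outputs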
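The save-vs-recompute trade-off mentioned above is controlled by AOTAutograd's partition function. A minimal sketch using functorch's public entry points, shown independently of the new scan API (how scan itself threads the partitioner through is not shown in this excerpt):

import torch
from functorch.compile import (aot_function, default_partition,
                               min_cut_rematerialization_partition)

def nop(gm, sample_inputs):
    # No-op "compiler": run the partitioned FX graph as-is.
    return gm

def f(x):
    return torch.sin(x).sum()

# default_partition saves intermediate activations for the backward pass.
f_saved = aot_function(f, fw_compiler=nop, partition_fn=default_partition)
# min_cut_rematerialization_partition recomputes them in the backward
# instead, trading compute for memory.
f_remat = aot_function(f, fw_compiler=nop,
                       partition_fn=min_cut_rematerialization_partition)

x = torch.randn(8, requires_grad=True)
f_saved(x).backward()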

File tree

13 files changed: +1610 −150 lines

examples/decoder_only_model.py

Lines changed: 4 additions & 4 deletions
@@ -7,16 +7,16 @@
 from torch import nn
 
 
-# the default config is intentionally kept low to make it runable on a sigle tpu v2-8 core.
+# the default config is intentionally kept low to make it runnable on a single tpu v2-8 core.
 @dataclass
 class DecoderOnlyConfig:
   hidden_size: int = 1024
   num_hidden_layers: int = 2
   num_attention_heads: int = 8
   num_key_value_heads: int = 4
-  intermediate_size = 32 * 1024
-  vocab_size = 3200
-  use_flash_attention = False
+  intermediate_size: int = 32 * 1024
+  vocab_size: int = 3200
+  use_flash_attention: bool = False
 
 
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
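A note on why the annotation fix in this hunk matters: @dataclass only promotes annotated class attributes to fields, so the un-annotated assignments were plain class attributes that the generated __init__ could not override. A minimal illustration:

from dataclasses import dataclass, fields

@dataclass
class Config:
    hidden_size: int = 1024  # annotated: becomes a dataclass field
    vocab_size = 3200        # un-annotated: plain class attribute, not a field

print([f.name for f in fields(Config)])  # ['hidden_size']
# Config(vocab_size=100) would raise a TypeError, since vocab_size is not
# an __init__ parameter -- which is why the diff adds the annotations.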

test/run_tests.sh

Lines changed: 2 additions & 1 deletion
@@ -208,7 +208,8 @@ function run_xla_op_tests1 {
 function run_xla_op_tests2 {
   run_test "$CDIR/pjrt/test_dtypes.py"
   run_test "$CDIR/test_while_loop.py"
-  run_test "$CDIR/test_scan.py"
+  run_test "$CDIR/scan/test_scan.py"
+  run_test "$CDIR/scan/test_scan_layers.py"
   run_test "$CDIR/test_autocast.py"
   run_test "$CDIR/eager/test_eager.py"
   run_test "$CDIR/eager/test_eager_with_xla_compile.py"
