create rms norm tensor at input.device instead of device 0 #21

Merged: 4 commits, Aug 15, 2024
5 changes: 4 additions & 1 deletion .github/pull_request_template.md
@@ -9,8 +9,11 @@ This is an optional section; is there anything specific that reviewers should be
## Testing Done
<!--- This is a required section; please describe how this change was tested. --->

+<!--
+Complete the following tasks before sending your PR, and replace `[ ]` with
-`[x]` to indicate you have done them.
+`[x]` to indicate you have done them.
+-->

- [ ] run `make test` to ensure correctness
- [ ] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence
4 changes: 2 additions & 2 deletions Makefile
@@ -2,7 +2,7 @@

# Command to run pytest for correctness tests
test:
-	pytest --disable-warnings -v test/ --ignore=test/convergence
+	pytest --disable-warnings test/ --ignore=test/convergence


# Command to run flake8 (code style check), isort (import ordering), and black (code formatting)
@@ -18,4 +18,4 @@ checkstyle:
# Command to run pytest for convergence tests
# We have to explicitly set HF_DATASETS_OFFLINE=1, or dataset will silently try to send metrics and timeout (80s) https://github.com/huggingface/datasets/blob/37a603679f451826cfafd8aae00738b01dcb9d58/src/datasets/load.py#L286
test-convergence:
-	HF_DATASETS_OFFLINE=1 pytest --disable-warnings -v -s test/convergence
+	HF_DATASETS_OFFLINE=1 pytest --disable-warnings test/convergence
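The `test-convergence` target above can be sketched as a direct shell invocation. This assumes a `liger-kernel` checkout, so the actual pytest call is left commented out; the point is that `HF_DATASETS_OFFLINE=1` must be set in the environment of the pytest process so the `datasets` library skips its network calls:

```shell
# Sketch: run the convergence tests offline, mirroring the Makefile target.
# HF_DATASETS_OFFLINE=1 stops the datasets library from silently trying to
# send metrics and hitting an 80s timeout (see the linked load.py source).
export HF_DATASETS_OFFLINE=1
echo "HF_DATASETS_OFFLINE=$HF_DATASETS_OFFLINE"
# pytest --disable-warnings test/convergence   # requires the repo checkout
```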
4 changes: 2 additions & 2 deletions src/liger_kernel/ops/rms_norm.py
@@ -107,8 +107,8 @@ def forward(ctx, X, W, eps):
n_rows, n_cols = X.shape
BLOCK_SIZE, num_warps = calculate_settings(n_cols)

-    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device="cuda")
-    r = torch.empty(n_rows, dtype=X.dtype, device="cuda")
+    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
+    r = torch.empty(n_rows, dtype=X.dtype, device=X.device)

# Check constraints.
assert (
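The core fix in `rms_norm.py` is allocating output tensors with `device=X.device` instead of the hardcoded `device="cuda"` (which always resolves to `cuda:0`, breaking inputs that live on another GPU or on CPU). A plain-PyTorch sketch of the same allocation pattern, assuming a hypothetical reference helper `rms_norm_forward` (this is not the repo's Triton kernel):

```python
import torch

def rms_norm_forward(X, W, eps=1e-6):
    """Reference RMSNorm whose outputs follow X.device (sketch, not the Triton kernel)."""
    n_rows, n_cols = X.shape
    # The PR's fix: device=X.device rather than device="cuda", so a tensor on
    # cuda:1 (or cpu) gets its outputs allocated on the same device.
    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    r = torch.empty(n_rows, dtype=X.dtype, device=X.device)
    # r caches the per-row inverse RMS, as the kernel does for the backward pass.
    r.copy_(torch.rsqrt(X.pow(2).mean(dim=-1) + eps))
    Y.copy_(X * r.unsqueeze(-1) * W)
    return Y, r
```

With the old `device="cuda"` allocation, calling this on a CPU or `cuda:1` input would either crash or force a cross-device copy onto device 0.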
1 change: 1 addition & 0 deletions test/convergence/test_mini_models.py
@@ -206,6 +206,7 @@ def run_mini_model(
[
("mini_llama3", 32, 1e-4, torch.float32, 1e-8, 1e-5, 1e-4, 1e-5, 2e-3, 1e-5),
("mini_llama3", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 1e-1, 1e-5, 1e-2, 1e-5),
+    # TODO: torch 2.5.0 nightly breaks mixtral test, but torch 2.3.0 works fine
("mini_mixtral", 32, 1e-4, torch.float32, 1e-8, 1e-5, 1e-3, 1e-5, 8e-3, 1e-5),
("mini_mixtral", 32, 1e-4, torch.bfloat16, 1e-8, 1e-5, 2.0, 1e-5, 1e-2, 1e-5),
("mini_mistral", 32, 1e-4, torch.float32, 1e-8, 1e-5, 5e-3, 1e-5, 5e-3, 1e-5),
2 changes: 2 additions & 0 deletions test/transformers/test_geglu.py
@@ -12,6 +12,8 @@
)
SLEEP_SECONDS = 0.1

+# TODO (yun dai): triton 3.0.0 breaks geglu due to tanh module issue


@pytest.mark.parametrize(
"bsz, seq_len, hidden_size, intermediate_size",