diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 800d91acb7ed..2021c7d93136 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -131,16 +131,16 @@ jobs:
         ### \'\'
   
   # L0: GPU unit tests
-  OPTIONAL_L0_Unit_Tests_GPU_ASR:
+  L0_Unit_Tests_GPU_ASR:
      needs: [cicd-test-container-setup]
      uses: ./.github/workflows/_test_template.yml
-     if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
+     if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_ASR') || needs.cicd-test-container-setup.outputs.all == 'true'
      with:
        RUNNER: self-hosted-azure
        TIMEOUT: 20
+       # TODO: remove this hack (importing ASRModel via `python -c` before running pytest)
        SCRIPT: |
-         NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads
-       IS_OPTIONAL: true
+         python -c "from nemo.collections.asr.models import ASRModel" && NEMO_NUMBA_MINVER=0.53 pytest tests/collections/asr -m "not pleasefixme" --with_downloads
 
   L0_Unit_Tests_GPU_Audio:
      needs: [cicd-test-container-setup]
@@ -1212,18 +1212,6 @@ jobs:
         matmul_precision=medium
       AFTER_SCRIPT: |
         rm -rf preds.json
-  
-
-  # L2: Transducer alignment
-  OPTIONAL_L2_Transducer_alignment_Running_pytest:
-    needs: [cicd-test-container-setup]
-    uses: ./.github/workflows/_test_template.yml
-    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L2_Transducer_alignment_Running_pytest') || needs.cicd-test-container-setup.outputs.all == 'true'
-    with:
-      RUNNER: self-hosted-azure
-      SCRIPT: |
-        pytest tests/collections/asr/decoding/rnnt_alignments_check.py --durations=-1 --with_downloads
-      IS_OPTIONAL: true
 
   # L2: Segmentation Tool
   L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav:
@@ -5456,7 +5444,7 @@ jobs:
       - gpu-test
       - cicd-test-container-setup
 
-      #- OPTIONAL_L0_Unit_Tests_GPU_ASR
+      - L0_Unit_Tests_GPU_ASR
       - L0_Unit_Tests_GPU_Audio
       - L0_Unit_Tests_GPU_Common
       - L0_Unit_Tests_GPU_LLM
@@ -5507,7 +5495,6 @@ jobs:
       - L2_ASR_Adapters_Linear_Adapters
       - L2_ASR_Adapters_RelPos_MHA_Adapters
       - L2_Speech_Transcription_Speech_to_Text_Transcribe
-      #- OPTIONAL_L2_Transducer_alignment_Running_pytest
       - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Eng_CitriNet_with_wav
       - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3
       - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference
diff --git a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py
index aa49435ded16..fc501b3d00de 100644
--- a/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py
+++ b/nemo/collections/asr/parts/submodules/cuda_graph_rnnt_greedy_decoding.py
@@ -293,6 +293,13 @@ def __call__(
         device: torch.device,
         partial_hypotheses: Optional[List[rnnt_utils.Hypothesis]] = None,
     ):
+        if x.device.type != "cuda":
+            # If CUDA graphs are enabled and the "frame-looping" algorithm is requested, this class
+            # cannot handle non-CUDA inputs, so we pass them to the original caller instead.
+            return self.caller._greedy_decode_blank_as_pad_loop_frames(
+                x=x, out_len=out_len, device=device, partial_hypotheses=partial_hypotheses
+            )
+
         if partial_hypotheses is not None:
             raise NotImplementedError(
                 "`partial_hypotheses` support is not available "
diff --git a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py
index 31fe822573ce..4715f4826493 100644
--- a/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py
+++ b/tests/collections/asr/decoding/test_cuda_graph_rnnt_greedy_decoding.py
@@ -53,8 +53,8 @@ def stt_en_fastconformer_transducer_large():
             8,
             True,
             marks=pytest.mark.xfail(
-                reason="""Cannot instantiate the 
-body cuda graph of a conditional node with a persistent kernel (in this case, 
+                reason="""Cannot instantiate the
+body cuda graph of a conditional node with a persistent kernel (in this case,
 a persistent LSTM), which is triggered in cudnn by using a batch size of 8."""
             ),
         ),
diff --git a/tests/collections/asr/decoding/rnnt_alignments_check.py b/tests/collections/asr/decoding/test_rnnt_alignments.py
similarity index 94%
rename from tests/collections/asr/decoding/rnnt_alignments_check.py
rename to tests/collections/asr/decoding/test_rnnt_alignments.py
index ec0656cbce49..5c43af28b1d4 100644
--- a/tests/collections/asr/decoding/rnnt_alignments_check.py
+++ b/tests/collections/asr/decoding/test_rnnt_alignments.py
@@ -13,10 +13,6 @@
 # limitations under the License.
 
 
-# NOTE: the file name does not contain "test" on purpose to avoid executing
-#       these tests outside of the CI machines environment, where test data is
-#       stored
-
 from pathlib import Path
 from typing import Union
 
@@ -27,6 +23,7 @@
 
 from nemo.collections.asr.models import EncDecRNNTBPEModel
 from nemo.collections.asr.parts.utils.manifest_utils import read_manifest, write_manifest
+from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from nemo.collections.asr.parts.utils.transcribe_utils import prepare_audio_data
 
 DEVICES = []
@@ -65,7 +62,7 @@ def get_rnnt_alignments(
     loop_labels: bool = True,
     use_cuda_graph_decoder=False,
     device="cuda",
-):
+) -> list[Hypothesis]:
     cfg = OmegaConf.structured(TranscriptionConfig())
     cfg.rnnt_decoding.confidence_cfg.preserve_frame_confidence = True
     cfg.rnnt_decoding.preserve_alignments = True
@@ -74,12 +71,13 @@ def get_rnnt_alignments(
         cfg.rnnt_decoding.greedy.loop_labels = loop_labels
         cfg.rnnt_decoding.greedy.use_cuda_graph_decoder = use_cuda_graph_decoder
     cfg.dataset_manifest = str(manifest_path)
-    filepaths = prepare_audio_data(cfg)[0][:10]  # selecting 10 files only
+    filepaths = prepare_audio_data(cfg)[0][:8]  # selecting 8 files only
+    # NB: the 9th file has the same transcription but a slightly different alignment for batched vs. non-batched decoding
 
     model = model.to(device)
     model.change_decoding_strategy(cfg.rnnt_decoding)
 
-    transcriptions = model.transcribe(
+    transcriptions: list[Hypothesis] = model.transcribe(
         audio=filepaths,
         batch_size=cfg.batch_size,
         num_workers=cfg.num_workers,